diff --git a/examples/Text Normalization using NLPurify.ipynb b/examples/Text Normalization using NLPurify.ipynb new file mode 100644 index 0000000..df319b5 --- /dev/null +++ b/examples/Text Normalization using NLPurify.ipynb @@ -0,0 +1,268 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c4a37fef", + "metadata": {}, + "source": [ + "

Text Normalization

\n", + "\n", + "---\n", + "In the world of Natural Language Processing (NLP), we work with human language. However, human language is inherently messy, varied, and full of nuances that can be confusing for computers. Text normalization is the foundational process of cleaning and standardizing raw text into a consistent, predictable format. Think of it as tidying up a chaotic room before you can find anything; we are tidying up language so a machine learning model can understand it.\n", + "\n", + "The primary goal is to reduce the randomness in text by grouping different variations of a word or phrase into a single, canonical form. For example, to a computer, the words \"run,\" \"Run,\" and \"running\" are three distinct items. Text normalization ensures these are all recognized as the same core concept, simplifying the data for NLP models. This preprocessing step is crucial for the success of almost all major NLP tasks, including search engines, sentiment analysis, and machine translation.\n", + "\n", + "**Why is it so Important?**\n", + "\n", + " * **Improved Model Performance:** Clean, standardized data helps models learn more effectively, leading to higher accuracy.\n", + " * **Reduced Complexity:** It significantly shrinks the vocabulary the model needs to learn, which reduces computational costs and memory usage.\n", + " * **Enhanced Feature Extraction:** When different forms of a word are treated as a single feature, the statistical power of that feature increases, leading to better insights." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "526847b9", + "metadata": { + "ExecuteTime": { + "end_time": "2025-10-23T09:26:13.253923Z", + "start_time": "2025-10-23T09:26:13.230466Z" + } + }, + "outputs": [], + "source": [ + "import os # miscellaneous os interfaces\n", + "import sys # configuring python runtime environment" + ] + }, + { + "cell_type": "markdown", + "id": "e7157cc7", + "metadata": {}, + "source": [ + "## NLP Libraries\n", + "\n", + "Python offers a rich ecosystem of libraries for Natural Language Processing (NLP), catering to various needs from foundational tasks to advanced deep learning models. Here are some of the most prominent ones:\n", + "\n", + " 1. [NLTK](https://www.nltk.org/) Natural Language Toolkit - a comprehensive library for foundational NLP tasks like tokenization, stemming, lemmatization, etc.\n", + " 2. [spaCy](https://spacy.io/) Industrial-Strength NLP - designed for production-level applications, emphasizing speed and efficiency." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "f6f328a9", + "metadata": { + "ExecuteTime": { + "end_time": "2025-10-23T09:26:13.269218Z", + "start_time": "2025-10-23T09:26:13.255910Z" + } + }, + "outputs": [], + "source": [ + "# import nltk" + ] + }, + { + "cell_type": "markdown", + "id": "4ab6f85c", + "metadata": {}, + "source": [ + "### NLPurify\n", + "\n", + "A text cleaning and extraction engine was developed using a combination of traditional techniques like Unicode translations, cleaning using regular expressions, and modern tools like \"natural language processing\"\n", + "and \"large language models\" to detect and clean long texts and create word vectors. The library is developed as a one-stop solution that modifies and collates the utility functions to provide common things in one place." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "7afba9c9", + "metadata": { + "ExecuteTime": { + "end_time": "2025-10-23T09:26:20.019613Z", + "start_time": "2025-10-23T09:26:13.274467Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Current Version: v2.1.0.dev0\n" + ] + } + ], + "source": [ + "import nlpurify as nlpu\n", + "\n", + "# general convention is to assign the short form ``nlpu`` to the library\n", + "# print the current version of the library - for debugging and documentation\n", + "print(f\"Current Version: {nlpu.__version__}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "4e3479e9", + "metadata": { + "ExecuteTime": { + "end_time": "2025-10-23T09:27:21.225941Z", + "start_time": "2025-10-23T09:27:21.218991Z" + } + }, + "outputs": [], + "source": [ + "text = '''\n", + " This is a uncLeaneD text with lots of\n", + " extra WHITE \n", + "space.\n", + "'''" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "e4d2fc66", + "metadata": { + "ExecuteTime": { + "end_time": "2025-10-23T09:37:55.741806Z", + "start_time": "2025-10-23T09:37:55.722247Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Normalized White Space: `This is a uncLeaneD text with lots of extra WHITE space.`\n" + ] + } + ], + "source": [ + "model = nlpu.preprocessing.normalization.WhiteSpace()\n", + "print(f\"Normalized White Space: `{model.apply(text)}`\")" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "d405780a", + "metadata": { + "ExecuteTime": { + "end_time": "2025-10-23T09:42:40.080806Z", + "start_time": "2025-10-23T09:42:40.061140Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Uniform Case Folding: `\n", + " this is a uncleaned text with lots of\n", + " extra white \n", + "space.\n", + "`\n" + ] + } + ], + "source": [ + "model = nlpu.preprocessing.normalization.CaseFolding()\n", 
+ "print(f\"Uniform Case Folding: `{model.apply(text)}`\")" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "9f4aa5e6", + "metadata": { + "ExecuteTime": { + "end_time": "2025-10-23T09:46:21.586061Z", + "start_time": "2025-10-23T09:46:21.572034Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Uniform Case Folding: `This uncLeaneD text lots extra WHITE space .`\n" + ] + } + ], + "source": [ + "model = nlpu.preprocessing.normalization.StopWords()\n", + "print(f\"Uniform Case Folding: `{model.apply(text)}`\")" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "ca00225a", + "metadata": { + "ExecuteTime": { + "end_time": "2025-10-23T09:46:22.408174Z", + "start_time": "2025-10-23T09:46:22.400677Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Uniform Case Folding: `['This', 'is', 'a', 'uncLeaneD', 'text', 'with', 'lots', 'extra', 'WHITE']`\n" + ] + } + ], + "source": [ + "model = nlpu.preprocessing.utils.WordTokenize(vanilla = True, tokenizer = False, vanilla_getalnum = True)\n", + "print(f\"Uniform Case Folding: `{model.apply(text)}`\")" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "0a2d28cf", + "metadata": { + "ExecuteTime": { + "end_time": "2025-10-23T09:46:24.121050Z", + "start_time": "2025-10-23T09:46:24.101463Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "UNCLEANED TEXT LOTS EXTRA WHITE SPACE .\n" + ] + } + ], + "source": [ + "print(nlpu.preprocessing.normalization.normalize(text, upper = True, stopwords_in_uppercase = True))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "TensorFlow CPU (v2.12.0)", + "language": "python", + "name": "tensorflow" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + 
"pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nlpurify/__init__.py b/nlpurify/__init__.py index e428c76..0579d0b 100644 --- a/nlpurify/__init__.py +++ b/nlpurify/__init__.py @@ -20,14 +20,10 @@ __version__ = "v2.1.0.dev0" # init-time options registrations +from nlpurify import preprocessing + from nlpurify.scoring import fuzzy from nlpurify.scoring import regexp -from nlpurify.feature import ( - selection as feature_selection -) - -from nlpurify.normalization import ( - normalize, - strip_whitespace -) +from nlpurify.feature import selection as feature_selection +from nlpurify.feature import extraction as feature_extraction diff --git a/nlpurify/feature/selection/__init__.py b/nlpurify/feature/selection/__init__.py index 4188f03..7c073e9 100644 --- a/nlpurify/feature/selection/__init__.py +++ b/nlpurify/feature/selection/__init__.py @@ -3,5 +3,3 @@ """ Selection of Finite Set of Features/Tokens for Efficient Modelling """ - -from nlpurify.feature.selection.nltk import * # noqa: F401, F403 # pyright: ignore[reportMissingImports] diff --git a/nlpurify/feature/selection/nltk.py b/nlpurify/feature/selection/nltk.py deleted file mode 100644 index 5f25759..0000000 --- a/nlpurify/feature/selection/nltk.py +++ /dev/null @@ -1,313 +0,0 @@ -# -*- encoding: utf-8 -*- - -""" -A Collection of Methods from the Natural Language Toolkit (NLTK) - -NLTK is a leading suite of library designed for symbolic and -statistical natural language program in Python. This module is -designed for the English language, however, the due to continued -effort from the community other languages are being incorporated. - -A set of functions are designed using the NLTK library for feature -selection methods like stop words removal, word lemmatizations etc. -A feature selection method works best when a text is normalized. 
-This can be achived by using :mod:`nlpurify.normalizeText()` method -and is generally internally by all the related functions defined. -""" - -import re - -from typing import Union - -from nltk.corpus import stopwords -from nltk.tokenize import word_tokenize - -from nlpurify.normalization import normalize - -def tokenize_text(text : str, regexp : bool = False, vanilla : bool = False, **kwargs) -> list: - """ - Tokenization of Text into Lists of Substrings - - A word vector is one where a scentence is broken down into small - pieces (or vectors) which constitutes the features of an model. - To achieve tokenization, the most simpler approach is by using the - in-built string method ``text.split()`` which splits the string - by white characters. However, this is often in efficient and - results in an improper model development. This can be resolved by - using the :mod:`nltk.tokenize` methods. - - By default, the function is tuned with the ``word_tokenize`` - method which works as below: - - .. code-block:: python - - from nltk.tokenize import word_tokenize - s = '''Good muffins cost $3.88\\nin New York. Please buy me - two of them.\\n\\nThanks.''' - - word_tokenize(s) - >> ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', - 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', - '.', 'Thanks', '.'] - - The power of tokenization is appreciated even more when the text - data is cleaned and normalized as defined in the function - :func:`nlpurify.feature.selection.nltk.remove_stopwords`. - - :type text: str - :param text: The raw text which is internally normalized using - :func:`nlpurify.normalizeText()` for feature selection. To - stop normalization of text see the keyword arguments section. - - :type vanilla: bool - :param vanilla: Override the :func:`nltk.tokenize.word_tokenize` - function with Python vanilla method. Either the method allows - setting ``regexp == True`` or ``vanilla == True`` while both - are not allowed. 
The vanilla method uses string attributes - like ``.split()`` and other keyword arguments to control - and tokenize the data. - - :type regexp: bool - :param regexp: Override the :func:`nltk.tokenize.word_tokenize` - function with a regular expressions. Either the method allows - setting ``regexp == True`` or ``vanilla == True`` while both - are not allowed. - - **Keyword Arguments** - - The default keyword arguments are defined for the - :func:`nltk.tokenize.word_tokenize` function. - - * **preserve_line** (*bool*): A flag to decide whether to - sentence tokenize the text or not, as accepted by - the function. Defaults to False. - - * **tokenize_language** (*str*): The language model name as - accepted by the Punkt corpus by NLTK. Defaults to the - "english" language, as in function. - - The paramter value associated with regular expression data - control is as below: - - * **expression** (*str*): The regular expression which - is used to compile the regular expression. This should - be a r-string which can be directly used. Defaults to - ``r"\w+"` value, i.e. only word characters. - - The paramter value associated with the Python vanilla method - control is as below: - - * **split_by** (*str*): The value is passed to :func:`split()` - to control seperated terms, defaults to white space. - - * **retalpha** (*bool*): If set to True (default) returns only - alphabets which is an inherent string property: ``isalpha`` - method. If both ``retalpha`` and ``retalnum`` is true, then - ``retalpha`` has an overriding effect. - - * **retalnum** (*bool*): If set to True (default) returns only - alphabets and numeric characters from the string which uses - an inherent string property: ``isalnum`` method. - - **Function Example** - - The function primarily returns a list of strings which can be - used to create word vector. - - .. code-block:: python - - s = "this is an example string, with p()nct & n0s." 
- - # using the default word_tokenize: - print(nlpurify.feature_selection.tokenize_text(s)) - >> ['this', 'is', 'an', 'example', 'string', ',', 'with', 'p', '(', ')', 'nct', '&', 'n0s', '.'] - - # using regular expressions, default configuration - print(nlpurify.feature_selection.tokenize_text(s, regexp = True)) - >> ['this', 'is', 'an', 'example', 'string', 'with', 'p', 'nct', 'n0s'] - - # a custom regular expressions is also accepted, feel free to experiment! - # the following expression mimics the above example r"\w+" but using custom expression - print(nlpurify.feature_selection.tokenize_text(s, regexp = True, expression = r"[a-zA-Z0-9_]+")) - >> ['this', 'is', 'an', 'example', 'string', 'with', 'p', 'nct', 'n0s'] - - # this is using vanilla python string functions with default values - print(nlpurify.feature_selection.tokenize_text(s, vanilla = True)) - >> ['this', 'is', 'an', 'example', 'with'] - - # to understand the difference using retalpha == False, i.e., retalnum = True - s = "this is an example string, with p()nct & n0s. 987" - print(nlpurify.feature_selection.tokenize_text(s, vanilla = True, retalpha = False)) - >> ['this', 'is', 'an', 'example', 'with', '987'] - - **Error Guidelines** - - :raises ValueError: The error is raised when both the attribute - ``vanilla`` and ``regexp`` is set to True. - - :raises ImportError: Error is raised when one or more nltk corpus - is not available in the system. - - **Return Type** - - :rtype: list[str] - :return: Returns a tokenized list of strings. To represent and - save the same in a tabular format use ``"".join()`` method. - """ - - preserve_line = kwargs.get("preserve_line", False) - tokenize_language = kwargs.get("tokenize_language", "english") - - expression = re.compile(kwargs.get("expression", r"\w+")) - - split_by = kwargs.get("split_by", " ") - retalpha = kwargs.get("retalpha", True) - retalnum = kwargs.get("retalnum", True) - - tokenize_method = None - # ! 
do not allow both `regexp` and `vanilla` as true - if all([regexp, vanilla]): - raise ValueError("Both Control are Not Allowed.") - elif any([regexp, vanilla]): - # ? if the error is not raised, execute below - else section - if regexp: - tokenize_method = "regexp" - else: - # ? vanilla method is selected, however based - # on keyword selection - setting different index - if retalpha: - # this has an ovverriding effect thus first - tokenize_method = "vanilla-alnum" - elif retalnum: - tokenize_method = "vanilla-alpha" - else: - tokenize_method = "vanilla-vanilla" - else: - tokenize_method = "word-tokenize" - - # select method and parameter control, and finally - # return token data value from the choice of method: - tokens = { - "regexp" : expression.findall(text), - - # we've the three methods of vanilla like: - "vanilla-alnum" : [s for s in text.split(split_by) if s.isalnum()], - "vanilla-alpha" : [s for s in text.split(split_by) if s.isalpha()], - "vanilla-vanilla" : text.split(split_by), # default, no change - - "word-tokenize" : word_tokenize(text, language = tokenize_language, preserve_line = preserve_line) - } - - return tokens[tokenize_method] - - -def remove_stopwords(text : str, language : str = "english", rtype : object = str, **kwargs) -> str | list: - """ - Function to Remove Stopwods from a Raw Text using NLTK - - In a Natural Language Processing (NLP), stopwords are frequently - removed to improve the performance of the model and increase the - computational efficiency. Words like "and", "or", etc. does not - provide additional information to the model but are important for - communications. - - The NLTK library hosts a lists of words that should be removed. - To find the list of supported language, check the fields: - - .. code-block:: python - - from nltk.corpus import stopwords - print(stopwords.fileids()) # list of supported language - - Stopwords is available under :mod:`nltk.corpus` which is required - by the function. 
To download a corpus: - - .. code-block:: python - - import nltk - nltk.download("all") # download all the available corpus - nltk.download("stopwords") # only download stopwrds corpus - - While it is advisable to download all the corpus, but it is - left on the discretion of the user. The function may throw an - error in case there are additional dependent corpus which needs - to be downloaded. - - :type text: str - :param text: The raw text which is internally normalized using - :func:`nlpurify.normalizeText()` for feature selection. To - stop normalization of text see the keyword arguments section. - - :type language: str - :param language: The name of the language which is available - under the :mod:`nltk.corpus.stopwords`. To find the list - of accepted languages check ``stopwords.fields()``. Defaults - to the "english" language. - - **Keyword Arguments** - - * **tokenize** (*bool*): Tokenize the text using the - :mod:`nltk.tokenize.word_tokenize()` method to extract the - tokens from the strings which returns the syllables from - a single word. Defaults to True. If false, the function - internally creates a token vector using ``text.split()`` - method, which splits the words by spaces - which may - have unwanted effect when symbols are present in the text. - Check attributes of :func:`tokenize()` for more details. - - * **normalize** (*bool*): Normalize the text internally. - Defaults to true, else if the data is already normalized - then pass False. Note the function may not be able to - remove all the stopwords if the data is not normalized and - the case of the words is not lower. - - **Function Example** - - For more control over the tokenization, all the parameters - of :func:`tokenize_text()` is accepted. - - .. code-block:: python - - s = "this is an example string, with p()nct & n0s." - - # using defaults from tokenize_text, i.e., using word_tokenize - print(nlpurify.feature_selection.remove_stopwords(s)) - >> example string , p ( ) nct & n0s . 
- - # this we can further simplify by using other features - print(nlpurify.feature_selection.remove_stopwords(s, regexp = True)) - >> example string p nct n0s - - **Error Guidelines** - - :raises ValueError: The error is raised when the return type is - not in {str, list} values. Make sure the data type is an type - instance and is not passed as a string value. - - :raises ImportError: Error is raised when one or more nltk corpus - is not available in the system. - - **Return Type** - - :rtype: str | list - :return: A cleaned string or a vector (*iterable*) of selected - features from a given text message. - """ - - tokenize_ = kwargs.get("tokenize", True) - normalize_ = kwargs.get("normalize", True) - - stopwords_ = stopwords.words(language) # defaults to english - - # ? normalize the text using nlpurify.normalizeText() - # else, left at user's discreations or additional functionalities - text = normalize(text, **kwargs) if normalize_ else text - - tokens = tokenize_text(text, **kwargs) if tokenize_ else text - tokens = [word for word in tokens if word not in stopwords_] - - # ensure return type of the data, else raise error - if rtype not in [str, list]: - raise ValueError(f"Accepted arguments ``list`` or ``str`` received {rtype}.") - - return " ".join(tokens) if rtype == str else tokens diff --git a/nlpurify/normalization.py b/nlpurify/normalization.py deleted file mode 100644 index c26989b..0000000 --- a/nlpurify/normalization.py +++ /dev/null @@ -1,174 +0,0 @@ -# -*- encoding: utf-8 -*- - -""" -Module Involved to Normalization of Text - -The normalization of text involves cleaning of text/strings from -unwanted characters like double spacing, double line breaks to single -line breaks, etc. A single functional approach is designed to handle -all such user's requests. 
-""" - -import os -import re - -def strip_whitespace(text : str, **kwargs) -> str: - """ - Normalize Whitespaces in a Text Data - - Cleaning texts of white spaces like from beginning, end, and - also multiple white spaces does not add any value to a text and - should thus be removed to normalize the text. - - :type text: str - :param text: Original string which needs to be cleaned of - white spaces. - - Keyword Arguments - ----------------- - - The function now provides the following additional keyword - arguments for control: - - * **lstrip** (*bool*): Left strip white space from the - provided text. Defaults to True. Setting any of the value - to ``False`` overrides the default ``.strip()` function. - * **rstrip** (*bool*): Right strip white space from the - provided text. Defaults to True. Setting any of the value - to ``False`` overrides the default ``.strip()` function. - * **multiple_whitespace** (*bool*): Delete multiple spaces - from the text. This uses the pattern cleaning using - regular expression. Defaults to True. - - Example(s) & Use Case(s) - ------------------------ - - The function can be used to return a clean string of white spaces - as per user requirement: - - .. code-block:: python - - statement = " this is an example string with white space " - - # example of default behavior - remove all abnormal spaces:: - print(f"`{nlpurify.strip_whitespace(statement)}`") - >>> `this is an example string with white space` - - # example of using either lstrip/rstrip/none as keywords - print(f"`{nlpurify.strip_whitespace(statement, lstrip = False)}`") - >>> ` this is an example string with white space` - - # example of setting multiple_whitespace - print(f"`{nlpurify.strip_whitespace(statement, multiple_whitespace = False)}`") - >>> `this is an example string with white space` - - :rtype: str - :return: Return a cleaner version of string free of white - characters as per user requirement. 
- """ - - lstrip = kwargs.get("lstrip", True) - rstrip = kwargs.get("rstrip", True) - multiple_whitespace = kwargs.get("multiple_whitespace", True) - - if all([lstrip, rstrip]): - # when both the condition is true, then default to `.strip()` - text = text.strip() - else: - # we cannot use the default strip function and should be - # handled seperately using each conditional statement - text = text.lstrip() if lstrip else text.rstrip() if rstrip else text - - # clean the text of multiple white spaces using regular expression - pattern = re.compile(r"\s+") # one or more white space character - text = pattern.sub(" ", text) if multiple_whitespace else text - - return text - - -def normalize(text : str, strip : bool = True, **kwargs) -> str: - """ - Normalize a Text for AI/ML Operations to Reduce Randomness - - The normalization function uses the in-built string function like - :attr:`.strip()`, :attr:`.replace()` etc. to return a cleaner - version. The following arguments are available for more control. - A normalized texts may have the following properties: - - * It may not start or end with a white space character, - * It may not have double space instead of single space, and - * It may not be spread across multiple lines (i.e., paragraphs). - - All the above properties are desired, and can improve performance - when used to train a large language model. Normalizaton of texts - may also involve uniform case, typically :attr:`.lower()` that - can be used to create a word vector. - - :type text: str - :param text: The base uncleaned text, all the operations are - done on this text to return a cleaner version. The string can - be single line, multi-line (example from "text area") and can - have any type of escape characters. - - :type strip: bool - :param strip: The global attribute to clean and normalize text - of white spaces and multiple line breaks. 
- - Keyword Arguments - ----------------- - - All the arguments of :func:`nlpurify.normalize.strip_whitespace()` - is accepted. In addition, the following are specific to this - function: - - * **strip_line_breaks** (*bool*): Strip line breaks and - returns a single line statement. This uses the os default - which is either "CR LF" for windows or "LF" for *nix - based systems. However, the default value can be override - using keyword argument :attr:`line_break_seperator`. - Defaults to True. - - * **line_break_seperator** (*str*): The end line character - which is either "\\r\\n" for windows or "\\n" for *nix - based systems. By default defaults to running operating - systems default. - - * **strip_tab_space** (*bool*): Strip a line of tab character, - defaults to True. - - Example(s) & Use Case(s) - ------------------------ - - The function returns all scentence to default lower case, and - strips the text filed of white spaces and multiple lines into one - single scentence. - - .. code-block:: python - - statement = ''' - thIs Is an example string with \t\nwhite space - - loreememm ipsum dolor - - ''' - - # default behavior removes all into single statement - print(f"`{nlpurify.normalize(statement)}`") - >>> `this is an example string with white space loreememm ipsum dolor` - - :rtype: str - :return: Return a cleaner version of string free of white - characters as per user requirement. - """ - - line_break_seperator = kwargs.get("line_break_seperator", os.linesep) - - # normalize text of line breaks based on os/user defined - text = text.replace(line_break_seperator, " ") \ - if kwargs.get("strip_line_breaks", True) else text - text = text.replace(line_break_seperator, " ") \ - if kwargs.get("strip_tab_space", True) else text - - # ! 
💣 always return the text in lowercase instead of user choice - # in addition, run the white space removal logic to normalize the text - return strip_whitespace(text, **kwargs).lower() if strip else text.lower() diff --git a/nlpurify/preprocessing/__init__.py b/nlpurify/preprocessing/__init__.py index 816787e..be0a62c 100644 --- a/nlpurify/preprocessing/__init__.py +++ b/nlpurify/preprocessing/__init__.py @@ -3,3 +3,6 @@ """ Utility Tools to Convert Raw Texts into a Structured Format """ + +from nlpurify.preprocessing import utils +from nlpurify.preprocessing import normalization diff --git a/nlpurify/preprocessing/normalization.py b/nlpurify/preprocessing/normalization.py new file mode 100644 index 0000000..9c98dbe --- /dev/null +++ b/nlpurify/preprocessing/normalization.py @@ -0,0 +1,246 @@ +# -*- encoding: utf-8 -*- + +""" +Text normalization is the process of converting text into a +consistent, standard, or "canonical" form. The goal is to reduce +randomness and variations in the text data, which helps in reducing +the overall number of unique words (the vocabulary size) and ensures +that different forms of the same word are treated as one. + +The main goal is to provide a single function that can be used to +achieve normalization goals - popular methods are text cases (setting +lower or upper case to all the words), stopwords removal etc. + +.. code-block:: python + + import nlpurify as nlpu + + ... + text = " My unCleaned text!! " + print(nlpu.preprocessing.normalization.normalize(text, ...)) + >> "my uncleaned text" # example of a cleaned text + +The core methods are kept simple, and generic arguments are used which +are widely recognized/used by popular libraries. 
+""" + +import os +import re + +from pydantic import BaseModel +from abc import ABC, abstractmethod + +from nltk.corpus import stopwords + +from nlpurify.preprocessing.utils import WordTokenize + +class _base_normalize(BaseModel, ABC): + """ + Base Settings for Text Normalization with Field Validation + """ + + @abstractmethod + def apply(self, text : str) -> str: + pass + + +class WhiteSpace(_base_normalize): + """ + A Model to Normalize White Space (space, tabs, newlines) from Text + + Cleaning texts of white spaces like from beginning, end, and + also multiple white spaces does not add any value to a text and + should thus be removed to normalize the text. + + :param strip, lstrip, rstrip: Settings to strip white spaces from + beginning or end of the string for normalization. By default, + all the spaces are removed as they do not provide any + additional information and is mostly an error in typing text. + + :param newline: Strip new line characters from a multiple line + (i.e., a paragraph or text from "text area") to get one single + text, defaults to True. + + :param newlinesep: A string value which defaults to the systems' + default new line seperator ("\r\n" `CRLF` for windows, and + "\n" `LF` for *nix based systems) to replace from string. + + :param multispace: Replace multiple spaces which often reduces the + models' performance, defaults to True. + + A modular approach is now enabled which is derived from a base + normalization class. The usage is as below: + + .. code-block:: python + + import nlpurify as nlpu + model = nlpu.preprocessing.normalization.WhiteSpace() + + # let's define a multi-line uncleaned text + text = ''' + This is a uncleaned text with lots of + extra white + space. + ''' + + print(model.apply(text)) # uses default settings + >> This is a uncleaned text with lots of extra white space. + + The model does not accept additional arguments and the function + ``.apply()`` is used to clean and normalize white space from text. 
+ """ + + strip : bool = True + lstrip : bool = True + rstrip : bool = True + newline : bool = True + + # ? if new line is true, then also allow to provide new line + # which defaults to the operating system default + newlinesep : str = os.linesep + + # ? remove multiple whitespace - uses regual expressions + multispace : bool = True + + + def apply(self, text : str) -> str: + pattern = re.compile(r"\s+") # one/more white spaces + + # first - strip the white space from beginning and end of text + if self.strip: + text = text.strip() + elif self.lstrip: + text = text.lstrip() + elif self.rstrip: + text = text.rstrip() + else: + pass # no strip processing + + # second, remove new line characters from the text + text = text.replace(self.newlinesep, " ") if self.newline \ + else text + + # third remove multiple white spaces from the string + text = pattern.sub(" ", text) if self.multispace else text + + return text + + +class CaseFolding(_base_normalize): + upper : bool = False + lower : bool = True + + def apply(self, text : str) -> str: + return text.upper() if self.upper else text.lower() if \ + text.lower() else text + + +class StopWords(_base_normalize): + language : str = "english" + extrawords : list = [] + + # ! by default, nltk library provides stopwords in lower case + # however, we can override and set the value as per our case needs + stopwords_in_uppercase : bool = False + + # ! 
def normalize(
        text : str,
        whitespace : bool = True,
        casefolding : bool = True,
        stopwords : bool = True,
        **kwargs
    ) -> str:
    """
    Normalize a raw text by chaining the available techniques.

    The normalization function uses the in-built string function like
    :attr:`.strip()`, :attr:`.replace()` etc. to return a cleaner
    version. A normalized text may have the following properties:

        * It may not start or end with a white space character,
        * It may not have multiple spaces or spaces in the beginning
          or end of the sentence, and
        * It may not be spread in multiple lines (i.e., paragraph).

    All the above properties are desired, and can improve performance
    when used to train a large language model. Normalization of texts
    may also involve uniform case, typically :attr:`.lower()` that
    can be used to create a word vector.

    All the normalization techniques are put into one callable method
    which in turn uses ``pydantic`` models for data validation and
    settings management of each technique. The enabled techniques are
    applied in a fixed order: white space, case folding, stop words.

    :type  text: str
    :param text: The base uncleaned text, all the operations are
        done on this text to return a cleaner version. The string can
        be single line, multi-line (example from "text area") and can
        have any type of escape characters.

    :type  whitespace: bool
    :param whitespace: A technique that normalizes the white space
        from the underlying texts. A text with multiple white spaces
        increases the processing load of a NLP/LLM model that can hurt
        performance. White spaces in a text includes spaces, tabs and
        new lines which is the primary delimiter of a NLP/LLM model.

    :type  casefolding: bool
    :param casefolding: Technique to normalize cases from a string to
        a desired format, i.e., either all caps or all in small case.
        It is always a good practice to convert all the raw text into
        small case and then send for further modeling.

    :type  stopwords: bool
    :param stopwords: Technique to drop the stop words from the text,
        the text is tokenized and joined back using a single space.

    Keyword Arguments
    -----------------

    The keyword arguments are the settings of the underlying models
    (:class:`WhiteSpace`, :class:`CaseFolding`, :class:`StopWords`).
    A keyword is forwarded to every model that declares a field of
    the same name, and a keyword that matches no field is ignored.
    The models are always built (and thus validated), even when the
    respective technique is toggled off.

    :rtype:  str
    :return: Return a cleaner version of string free of white
        characters as per user requirement.
    """

    def _settings(model) -> dict:
        # ? keep only the keywords that are declared by the model, the
        # fields which are not provided fall back to the model default
        return {
            field : kwargs[field]
            for field in model.model_fields if field in kwargs
        }

    whitespace_model = WhiteSpace(**_settings(WhiteSpace))
    casefolding_model = CaseFolding(**_settings(CaseFolding))
    stopwords_model = StopWords(**_settings(StopWords))

    if whitespace:
        text = whitespace_model.apply(text)

    if casefolding:
        text = casefolding_model.apply(text)

    if stopwords:
        text = stopwords_model.apply(text)

    return text
def apply(self, text : str) -> list:
    """
    Tokenize ``text`` using the configured method.

    The method is selected in the order ``regexp``, ``vanilla`` and
    ``tokenizer``, and the first enabled flag decides the method.

    :type  text: str
    :param text: The text to be tokenized.

    :rtype:  list
    :return: The list of tokens. If none of the methods is enabled
        then ``text`` is returned unchanged (i.e., as a string).
    """

    if self.regexp:
        expression = re.compile(self.regexp_pattern)
        return expression.findall(text)

    if self.vanilla:
        tokens = text.split(self.vanilla_split_by)

        # ? only the requested filter is evaluated, the alphabetic
        # filter takes precedence when both the filters are enabled
        if self.vanilla_getalpha:
            return [token for token in tokens if token.isalpha()]

        if self.vanilla_getalnum:
            return [token for token in tokens if token.isalnum()]

        # ! without any filter the split tokens are returned instead
        # of the raw string, so that the callers always get a list
        return tokens

    if self.tokenizer:
        return word_tokenize(
            text,
            language = self.tokenizer_language,
            preserve_line = self.tokenizer_preserve_line
        )

    # none of the methods is enabled, nothing to tokenize
    return text