From 4986a862dbe8e031ad81d0499eb814fa7011c9da Mon Sep 17 00:00:00 2001 From: E33605 Date: Wed, 22 Oct 2025 15:26:37 +0530 Subject: [PATCH 1/7] =?UTF-8?q?=F0=9F=92=A3=20normalization=20is=20now=20p?= =?UTF-8?q?art=20of=20nlp-utils/preprocessing=20module?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - grouping of similar functions into submodule is the logical approach - normalization of text is part of pre-processing of raw texts --- nlpurify/__init__.py | 2 +- nlpurify/feature/selection/nltk.py | 2 +- nlpurify/{ => preprocessing}/normalization.py | 0 3 files changed, 2 insertions(+), 2 deletions(-) rename nlpurify/{ => preprocessing}/normalization.py (100%) diff --git a/nlpurify/__init__.py b/nlpurify/__init__.py index e428c76..f273363 100644 --- a/nlpurify/__init__.py +++ b/nlpurify/__init__.py @@ -27,7 +27,7 @@ selection as feature_selection ) -from nlpurify.normalization import ( +from nlpurify.preprocessing.normalization import ( normalize, strip_whitespace ) diff --git a/nlpurify/feature/selection/nltk.py b/nlpurify/feature/selection/nltk.py index 5f25759..4c8299e 100644 --- a/nlpurify/feature/selection/nltk.py +++ b/nlpurify/feature/selection/nltk.py @@ -22,7 +22,7 @@ from nltk.corpus import stopwords from nltk.tokenize import word_tokenize -from nlpurify.normalization import normalize +from nlpurify.preprocessing.normalization import normalize def tokenize_text(text : str, regexp : bool = False, vanilla : bool = False, **kwargs) -> list: """ diff --git a/nlpurify/normalization.py b/nlpurify/preprocessing/normalization.py similarity index 100% rename from nlpurify/normalization.py rename to nlpurify/preprocessing/normalization.py From a0975d3eac14a3711d7d105f5bd43cccfb9ebdb2 Mon Sep 17 00:00:00 2001 From: E33605 Date: Wed, 22 Oct 2025 17:15:34 +0530 Subject: [PATCH 2/7] =?UTF-8?q?=F0=9F=A9=B9=F0=9F=9A=A7=20patching=20norma?= =?UTF-8?q?lization=20process=20with=20pydantic=20and=20abc?= MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- nlpurify/preprocessing/__init__.py | 2 + nlpurify/preprocessing/normalization.py | 71 ++++++++++++++++++++----- 2 files changed, 61 insertions(+), 12 deletions(-) diff --git a/nlpurify/preprocessing/__init__.py b/nlpurify/preprocessing/__init__.py index 816787e..fbb5be6 100644 --- a/nlpurify/preprocessing/__init__.py +++ b/nlpurify/preprocessing/__init__.py @@ -3,3 +3,5 @@ """ Utility Tools to Convert Raw Texts into a Structured Format """ + +from nlpurify.preprocessing.normalization import normalize diff --git a/nlpurify/preprocessing/normalization.py b/nlpurify/preprocessing/normalization.py index c26989b..134d7fe 100644 --- a/nlpurify/preprocessing/normalization.py +++ b/nlpurify/preprocessing/normalization.py @@ -1,17 +1,54 @@ # -*- encoding: utf-8 -*- """ -Module Involved to Normalization of Text +Text normalization is the process of converting text into a +consistent, standard, or "canonical" form. The goal is to reduce +randomness and variations in the text data, which helps in reducing +the overall number of unique words (the vocabulary size) and ensures +that different forms of the same word are treated as one. -The normalization of text involves cleaning of text/strings from -unwanted characters like double spacing, double line breaks to single -line breaks, etc. A single functional approach is designed to handle -all such user's requests. +The main goal is to provide a single function that can be used to +achieve normalization goals - popular methods are text cases (setting +lower or upper case to all the words), stopwords removal etc. + +.. code-block:: python + + import NLPurify as nlpu + + ... + text = " My unCleaned text!! " + print(nlpu.preprocessing.normalize(text, ...)) + >> "my uncleaned text" # example of a cleaned text + +The core methods is kept simple, and generic arguments are used which +are widely recognized/used by popular libraries. 
""" import os import re +from pydantic import BaseModel +from abc import ABC, abstractmethod + +class BaseSettings(BaseModel, ABC): + """ + Base Settings for Text Normalization with Field Validation + """ + + @abstractmethod + def apply(self, text : str) -> str: + pass + + +def WhiteSpace(BaseSettings): + strip : bool = True + lstrip : bool = True + rstrip : bool = True + + + def apply(self, text : str) -> str: + return text.strip() + def strip_whitespace(text : str, **kwargs) -> str: """ Normalize Whitespaces in a Text Data @@ -88,16 +125,15 @@ def strip_whitespace(text : str, **kwargs) -> str: def normalize(text : str, strip : bool = True, **kwargs) -> str: """ - Normalize a Text for AI/ML Operations to Reduce Randomness - The normalization function uses the in-built string function like :attr:`.strip()`, :attr:`.replace()` etc. to return a cleaner version. The following arguments are available for more control. A normalized texts may have the following properties: * It may not start or end with a white space character, - * It may not have double space instead of single space, and - * It may not be spread across multiple lines (i.e., paragraphs). + * It may not have multiple spaces or spaces in the beginning + or end of the scentence, and + * It may not be spread in multiple lines (i.e., paragraph). All the above properties are desired, and can improve performance when used to train a large language model. Normalizaton of texts @@ -110,13 +146,24 @@ def normalize(text : str, strip : bool = True, **kwargs) -> str: be single line, multi-line (example from "text area") and can have any type of escape characters. - :type strip: bool - :param strip: The global attribute to clean and normalize text - of white spaces and multiple line breaks. + All the normalization techniques are put into one callable method + which in turn uses ``pydantic`` models for data validation and + settings management of each technique. 
Keyword Arguments ----------------- + The keyword arguments are used to toggle on/off each of the + normalization techniques. Each technique is associated with an + underlying dictionary which is defined under respective models. + + * **whitespace** (*WhiteSpace*): A normalization technique + that normalizes the white space from the underlying texts. A + text with multiple white spaces increases the processing + load of a NLP/LLM model that can hurt performance. White + spaces in a text includes spaces, tabs and new lines which + is the primary delimiter of a NLP/LLM model. + All the arguments of :func:`nlpurify.normalize.strip_whitespace()` is accepted. In addition, the following are specific to this function: From 57e104115692ff9858970b2e912b902e4c48ff86 Mon Sep 17 00:00:00 2001 From: E33605 Date: Thu, 23 Oct 2025 11:43:24 +0530 Subject: [PATCH 3/7] =?UTF-8?q?=F0=9F=92=A3=E2=9A=99=EF=B8=8F=E2=9C=A8=20a?= =?UTF-8?q?dd=20pydantic=20base=20abstract=20model=20to=20normalize=20whit?= =?UTF-8?q?e=20space?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - new version uses the modular approach with .apply() method to clean texts of white space - all the keyword arguments are internally processed to make the model and run the underlying function - 📃 updated documentation of the model and function - 💣removed deprecated function strip_whitespace() from method - 💣 updated init-time optimization from the module --- nlpurify/__init__.py | 12 +- nlpurify/preprocessing/__init__.py | 2 +- nlpurify/preprocessing/normalization.py | 197 ++++++++++-------------- 3 files changed, 84 insertions(+), 127 deletions(-) diff --git a/nlpurify/__init__.py b/nlpurify/__init__.py index f273363..0579d0b 100644 --- a/nlpurify/__init__.py +++ b/nlpurify/__init__.py @@ -20,14 +20,10 @@ __version__ = "v2.1.0.dev0" # init-time options registrations +from nlpurify import preprocessing + from nlpurify.scoring import fuzzy from nlpurify.scoring import
regexp -from nlpurify.feature import ( - selection as feature_selection -) - -from nlpurify.preprocessing.normalization import ( - normalize, - strip_whitespace -) +from nlpurify.feature import selection as feature_selection +from nlpurify.feature import extraction as feature_extraction diff --git a/nlpurify/preprocessing/__init__.py b/nlpurify/preprocessing/__init__.py index fbb5be6..ea100bb 100644 --- a/nlpurify/preprocessing/__init__.py +++ b/nlpurify/preprocessing/__init__.py @@ -4,4 +4,4 @@ Utility Tools to Convert Raw Texts into a Structured Format """ -from nlpurify.preprocessing.normalization import normalize +from nlpurify.preprocessing import normalization diff --git a/nlpurify/preprocessing/normalization.py b/nlpurify/preprocessing/normalization.py index 134d7fe..41d731b 100644 --- a/nlpurify/preprocessing/normalization.py +++ b/nlpurify/preprocessing/normalization.py @@ -30,7 +30,7 @@ from pydantic import BaseModel from abc import ABC, abstractmethod -class BaseSettings(BaseModel, ABC): +class _base_normalize(BaseModel, ABC): """ Base Settings for Text Normalization with Field Validation """ @@ -40,90 +40,93 @@ def apply(self, text : str) -> str: pass -def WhiteSpace(BaseSettings): - strip : bool = True - lstrip : bool = True - rstrip : bool = True - - - def apply(self, text : str) -> str: - return text.strip() - -def strip_whitespace(text : str, **kwargs) -> str: +class WhiteSpace(_base_normalize): """ - Normalize Whitespaces in a Text Data + A Model to Normalize White Space (space, tabs, newlines) from Text Cleaning texts of white spaces like from beginning, end, and also multiple white spaces does not add any value to a text and should thus be removed to normalize the text. - :type text: str - :param text: Original string which needs to be cleaned of - white spaces. - - Keyword Arguments - ----------------- + :param strip, lstrip, rstrip: Settings to strip white spaces from + beginning or end of the string for normalization. 
By default, + all the spaces are removed as they do not provide any + additional information and is mostly an error in typing text. - The function now provides the following additional keyword - arguments for control: + :param newline: Strip new line characters from a multiple line + (i.e., a paragraph or text from "text area") to get one single + text, defaults to True. - * **lstrip** (*bool*): Left strip white space from the - provided text. Defaults to True. Setting any of the value - to ``False`` overrides the default ``.strip()` function. - * **rstrip** (*bool*): Right strip white space from the - provided text. Defaults to True. Setting any of the value - to ``False`` overrides the default ``.strip()` function. - * **multiple_whitespace** (*bool*): Delete multiple spaces - from the text. This uses the pattern cleaning using - regular expression. Defaults to True. + :param newlinesep: A string value which defaults to the systems' + default new line seperator ("\r\n" `CRLF` for windows, and + "\n" `LF` for *nix based systems) to replace from string. - Example(s) & Use Case(s) - ------------------------ + :param multispace: Replace multiple spaces which often reduces the + models' performance, defaults to True. - The function can be used to return a clean string of white spaces - as per user requirement: + A modular approach is now enabled which is derived from a base + normalization class. The usage is as below: .. 
code-block:: python - statement = " this is an example string with white space " + import nlpurify as nlpu + model = nlpu.preprocessing.normalization.WhiteSpace() - # example of default behavior - remove all abnormal spaces:: - print(f"`{nlpurify.strip_whitespace(statement)}`") - >>> `this is an example string with white space` - - # example of using either lstrip/rstrip/none as keywords - print(f"`{nlpurify.strip_whitespace(statement, lstrip = False)}`") - >>> ` this is an example string with white space` + # let's define a multi-line uncleaned text + text = ''' + This is a uncleaned text with lots of + extra white + space. + ''' - # example of setting multiple_whitespace - print(f"`{nlpurify.strip_whitespace(statement, multiple_whitespace = False)}`") - >>> `this is an example string with white space` + print(model.apply(text)) # uses default settings + >> This is a uncleaned text with lots of extra white space. - :rtype: str - :return: Return a cleaner version of string free of white - characters as per user requirement. + The model does not accept additional arguments and the function + ``.apply()`` is used to clean and normalize white space from text. """ - lstrip = kwargs.get("lstrip", True) - rstrip = kwargs.get("rstrip", True) - multiple_whitespace = kwargs.get("multiple_whitespace", True) + strip : bool = True + lstrip : bool = True + rstrip : bool = True + newline : bool = True - if all([lstrip, rstrip]): - # when both the condition is true, then default to `.strip()` - text = text.strip() - else: - # we cannot use the default strip function and should be - # handled seperately using each conditional statement - text = text.lstrip() if lstrip else text.rstrip() if rstrip else text + # ? 
if new line is true, then also allow to provide new line + # which defaults to the operating system default + newlinesep : str = os.linesep - # clean the text of multiple white spaces using regular expression - pattern = re.compile(r"\s+") # one or more white space character - text = pattern.sub(" ", text) if multiple_whitespace else text + # ? remove multiple whitespace - uses regual expressions + multispace : bool = True - return text + def apply(self, text : str) -> str: + pattern = re.compile(r"\s+") # one/more white spaces + + # first - strip the white space from beginning and end of text + if self.strip: + text = text.strip() + elif self.lstrip: + text = text.lstrip() + elif self.rstrip: + text = text.rstrip() + else: + pass # no strip processing + + # second, remove new line characters from the text + text = text.replace(self.newlinesep, " ") if self.newline \ + else text + + # third remove multiple white spaces from the string + text = pattern.sub(" ", text) if self.multispace else text + + return text -def normalize(text : str, strip : bool = True, **kwargs) -> str: + +def normalize( + text : str, + whitespace : bool = True, + **kwargs + ) -> str: """ The normalization function uses the in-built string function like :attr:`.strip()`, :attr:`.replace()` etc. to return a cleaner @@ -150,6 +153,13 @@ def normalize(text : str, strip : bool = True, **kwargs) -> str: which in turn uses ``pydantic`` models for data validation and settings management of each technique. + :type whitespace: bool + :param whitespace: A technique that normalizes the white space + from the underlying texts. A text with multiple white spaces + increases the processing load of a NLP/LLM model that can hurt + performance. White spaces in a text includes spaces, tabs and + new lines which is the primary delimiter of a NLP/LLM model. + Keyword Arguments ----------------- @@ -157,65 +167,16 @@ def normalize(text : str, strip : bool = True, **kwargs) -> str: normalization techniques. 
Each technique is associated with an underlying dictionary which is defined under respective models. - * **whitespace** (*WhiteSpace*): A normalization technique - that normalizes the white space from the underlying texts. A - text with multiple white spaces increases the processing - load of a NLP/LLM model that can hurt performance. White - spaces in a text includes spaces, tabs and new lines which - is the primary delimiter of a NLP/LLM model. - - All the arguments of :func:`nlpurify.normalize.strip_whitespace()` - is accepted. In addition, the following are specific to this - function: - - * **strip_line_breaks** (*bool*): Strip line breaks and - returns a single line statement. This uses the os default - which is either "CR LF" for windows or "LF" for *nix - based systems. However, the default value can be override - using keyword argument :attr:`line_break_seperator`. - Defaults to True. - - * **line_break_seperator** (*str*): The end line character - which is either "\\r\\n" for windows or "\\n" for *nix - based systems. By default defaults to running operating - systems default. - - * **strip_tab_space** (*bool*): Strip a line of tab character, - defaults to True. - - Example(s) & Use Case(s) - ------------------------ - - The function returns all scentence to default lower case, and - strips the text filed of white spaces and multiple lines into one - single scentence. - - .. code-block:: python - - statement = ''' - thIs Is an example string with \t\nwhite space - - loreememm ipsum dolor - - ''' - - # default behavior removes all into single statement - print(f"`{nlpurify.normalize(statement)}`") - >>> `this is an example string with white space loreememm ipsum dolor` - :rtype: str :return: Return a cleaner version of string free of white characters as per user requirement.
""" - line_break_seperator = kwargs.get("line_break_seperator", os.linesep) + whitespace_model = WhiteSpace(**{ + k : kwargs.get(k, WhiteSpace.model_fields[k].default) + for k in list(WhiteSpace.model_fields.keys()) + if k in kwargs.keys() + }) - # normalize text of line breaks based on os/user defined - text = text.replace(line_break_seperator, " ") \ - if kwargs.get("strip_line_breaks", True) else text - text = text.replace(line_break_seperator, " ") \ - if kwargs.get("strip_tab_space", True) else text - - # ! 💣 always return the text in lowercase instead of user choice - # in addition, run the white space removal logic to normalize the text - return strip_whitespace(text, **kwargs).lower() if strip else text.lower() + text = whitespace_model.apply(text) if whitespace else text + return text From 74fdcdaa949251062ffaa0500a7cb226133f855b Mon Sep 17 00:00:00 2001 From: E33605 Date: Thu, 23 Oct 2025 11:56:40 +0530 Subject: [PATCH 4/7] =?UTF-8?q?=E2=9C=A8=20added=20case=20folding=20normal?= =?UTF-8?q?ization=20technique?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 🚧 documentation and field validation pending --- nlpurify/preprocessing/normalization.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/nlpurify/preprocessing/normalization.py b/nlpurify/preprocessing/normalization.py index 41d731b..83ad5af 100644 --- a/nlpurify/preprocessing/normalization.py +++ b/nlpurify/preprocessing/normalization.py @@ -122,9 +122,19 @@ def apply(self, text : str) -> str: return text +class CaseFolding(_base_normalize): + upper : bool = False + lower : bool = True + + def apply(self, text : str) -> str: + return text.upper() if self.upper else text.lower() if \ + text.lower() else text + + def normalize( text : str, whitespace : bool = True, + casefolding : bool = True, **kwargs ) -> str: """ @@ -160,6 +170,12 @@ def normalize( performance.
White spaces in a text includes spaces, tabs and new lines which is the primary delimiter of a NLP/LLM model. + :type casefolding: bool + :param casefolding: Technique to normalize cases from a string to + a desired format, i.e., either all caps or all in small case. + It is always a good practice to convert all the raw text into + small case and then send for further modeling. + Keyword Arguments ----------------- @@ -178,5 +194,13 @@ def normalize( if k in kwargs.keys() }) + casefolding_model = CaseFolding(**{ + k : kwargs.get(k, CaseFolding.model_fields[k].default) + for k in list(CaseFolding.model_fields.keys()) + if k in kwargs.keys() + }) + text = whitespace_model.apply(text) if whitespace else text + text = casefolding_model.apply(text) if casefolding else text + return text From 4face8facc72f51017e2c44353b5f3e9ecfc2f5f Mon Sep 17 00:00:00 2001 From: E33605 Date: Thu, 23 Oct 2025 12:21:09 +0530 Subject: [PATCH 5/7] =?UTF-8?q?=E2=9C=A8=20added=20method=20to=20normalize?= =?UTF-8?q?=20by=20removing=20stopwords?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - added extra words options to be removed, fixes #13 - word tokenization and stop words removal are now in one modular method - 💣 this deprecates internal nlpurify/feature/selection/nltk.py methods - added attribute control to check stop words with desired case folding (upper/lower) as per final string's case folding requirements --- nlpurify/preprocessing/normalization.py | 39 +++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/nlpurify/preprocessing/normalization.py b/nlpurify/preprocessing/normalization.py index 83ad5af..68f35ad 100644 --- a/nlpurify/preprocessing/normalization.py +++ b/nlpurify/preprocessing/normalization.py @@ -30,6 +30,9 @@ from pydantic import BaseModel from abc import ABC, abstractmethod +from nltk.corpus import stopwords +from nltk.tokenize import word_tokenize + class _base_normalize(BaseModel, ABC): """ Base
Settings for Text Normalization with Field Validation @@ -131,10 +134,39 @@ def apply(self, text : str) -> str: text.lower() else text +class StopWords(_base_normalize): + language : str = "english" + extrawords : list = [] + + # ! by default, nltk library provides stopwords in lower case + # however, we can override and set the value as per our case needs + stopwords_in_uppercase : bool = False + + # ! removal of stop words is associated with word tokenization + tokenize : bool = True + + + def apply(self, text : str) -> str: + stopwords_ = stopwords.words(self.language) + self.extrawords + tokenized_ = word_tokenize( + text, language = self.language, preserve_line = False + ) if self.tokenize else text.split() + + # case folding of stopwords in upper/lower case as per need + stopwords_ = list(map( + lambda x : x.upper(), stopwords_ + )) if self.stopwords_in_uppercase else stopwords_ + + return " ".join([ + word for word in tokenized_ if word not in stopwords_ + ]) + + def normalize( text : str, whitespace : bool = True, casefolding : bool = True, + stopwords : bool = True, **kwargs ) -> str: """ @@ -200,7 +232,14 @@ def normalize( if k in kwargs.keys() }) + stopwords_model = StopWords(**{ + k : kwargs.get(k, StopWords.model_fields[k].default) + for k in list(StopWords.model_fields.keys()) + if k in kwargs.keys() + }) + text = whitespace_model.apply(text) if whitespace else text text = casefolding_model.apply(text) if casefolding else text + text = stopwords_model.apply(text) if stopwords else text return text From de50c94bdb31bfa265b62b412919cc536bb05456 Mon Sep 17 00:00:00 2001 From: E33605 Date: Thu, 23 Oct 2025 14:17:29 +0530 Subject: [PATCH 6/7] =?UTF-8?q?=F0=9F=92=A3=20refactor=20feature/selection?= =?UTF-8?q?=20section?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- nlpurify/feature/selection/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/nlpurify/feature/selection/__init__.py 
b/nlpurify/feature/selection/__init__.py index 4188f03..7c073e9 100644 --- a/nlpurify/feature/selection/__init__.py +++ b/nlpurify/feature/selection/__init__.py @@ -3,5 +3,3 @@ """ Selection of Finite Set of Features/Tokens for Efficient Modelling """ - -from nlpurify.feature.selection.nltk import * # noqa: F401, F403 # pyright: ignore[reportMissingImports] From 88d5b8f5fafd1362066a2845e274938a36d48af2 Mon Sep 17 00:00:00 2001 From: E33605 Date: Thu, 23 Oct 2025 15:21:39 +0530 Subject: [PATCH 7/7] =?UTF-8?q?=E2=9C=A8=F0=9F=92=A3=20refactor=20word=20t?= =?UTF-8?q?okenization=20in=20nlpurify.preprocessing.utils?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - added example in jupyter notebooks - added preprocessing utility methods - modularize word tokenization in stop words selection --- .../Text Normalization using NLPurify.ipynb | 268 +++++++++++++++ nlpurify/feature/selection/nltk.py | 313 ------------------ nlpurify/preprocessing/__init__.py | 1 + nlpurify/preprocessing/normalization.py | 9 +- nlpurify/preprocessing/utils.py | 78 +++++ 5 files changed, 352 insertions(+), 317 deletions(-) create mode 100644 examples/Text Normalization using NLPurify.ipynb delete mode 100644 nlpurify/feature/selection/nltk.py create mode 100644 nlpurify/preprocessing/utils.py diff --git a/examples/Text Normalization using NLPurify.ipynb b/examples/Text Normalization using NLPurify.ipynb new file mode 100644 index 0000000..df319b5 --- /dev/null +++ b/examples/Text Normalization using NLPurify.ipynb @@ -0,0 +1,268 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c4a37fef", + "metadata": {}, + "source": [ + "

Text Normalization

\n", + "\n", + "---\n", + "In the world of Natural Language Processing (NLP), we work with human language. However, human language is inherently messy, varied, and full of nuances that can be confusing for computers. Text normalization is the foundational process of cleaning and standardizing raw text into a consistent, predictable format. Think of it as tidying up a chaotic room before you can find anything; we are tidying up language so a machine learning model can understand it.\n", + "\n", + "The primary goal is to reduce the randomness in text by grouping different variations of a word or phrase into a single, canonical form. For example, to a computer, the words \"run,\" \"Run,\" and \"running\" are three distinct items. Text normalization ensures these are all recognized as the same core concept, simplifying the data for NLP models. This preprocessing step is crucial for the success of almost all major NLP tasks, including search engines, sentiment analysis, and machine translation.\n", + "\n", + "**Why is it so Important?**\n", + "\n", + " * **Improved Model Performance:** Clean, standardized data helps models learn more effectively, leading to higher accuracy.\n", + " * **Reduced Complexity:** It significantly shrinks the vocabulary the model needs to learn, which reduces computational costs and memory usage.\n", + " * **Enhanced Feature Extraction:** When different forms of a word are treated as a single feature, the statistical power of that feature increases, leading to better insights." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "526847b9", + "metadata": { + "ExecuteTime": { + "end_time": "2025-10-23T09:26:13.253923Z", + "start_time": "2025-10-23T09:26:13.230466Z" + } + }, + "outputs": [], + "source": [ + "import os # miscellaneous os interfaces\n", + "import sys # configuring python runtime environment" + ] + }, + { + "cell_type": "markdown", + "id": "e7157cc7", + "metadata": {}, + "source": [ + "## NLP Libraries\n", + "\n", + "Python offers a rich ecosystem of libraries for Natural Language Processing (NLP), catering to various needs from foundational tasks to advanced deep learning models. Here are some of the most prominent ones:\n", + "\n", + " 1. [NLTK](https://www.nltk.org/) Natural Language Toolkit - a comprehensive library for foundational NLP tasks like tokenization, stemming, lemmatization, etc.\n", + " 2. [spaCy](https://spacy.io/) Industrial-Strength NLP - designed for production-level applications, emphasizing speed and efficiency." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "f6f328a9", + "metadata": { + "ExecuteTime": { + "end_time": "2025-10-23T09:26:13.269218Z", + "start_time": "2025-10-23T09:26:13.255910Z" + } + }, + "outputs": [], + "source": [ + "# import nltk" + ] + }, + { + "cell_type": "markdown", + "id": "4ab6f85c", + "metadata": {}, + "source": [ + "### NLPurify\n", + "\n", + "A text cleaning and extraction engine was developed using a combination of traditional techniques like Unicode translations, cleaning using regular expressions, and modern tools like \"natural language processing\"\n", + "and \"large language models\" to detect and clean long texts and create word vectors. The library is developed as an one-stop solution that modifies and collates the utility functions to provide common things at one place." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "7afba9c9", + "metadata": { + "ExecuteTime": { + "end_time": "2025-10-23T09:26:20.019613Z", + "start_time": "2025-10-23T09:26:13.274467Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Current Version: v2.1.0.dev0\n" + ] + } + ], + "source": [ + "import nlpurify as nlpu\n", + "\n", + "# general convention is to assign the short form ``nlpu`` to the library\n", + "# print the current version of the library - for debugging and documentation\n", + "print(f\"Current Version: {nlpu.__version__}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "4e3479e9", + "metadata": { + "ExecuteTime": { + "end_time": "2025-10-23T09:27:21.225941Z", + "start_time": "2025-10-23T09:27:21.218991Z" + } + }, + "outputs": [], + "source": [ + "text = '''\n", + " This is a uncLeaneD text with lots of\n", + " extra WHITE \n", + "space.\n", + "'''" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "e4d2fc66", + "metadata": { + "ExecuteTime": { + "end_time": "2025-10-23T09:37:55.741806Z", + "start_time": "2025-10-23T09:37:55.722247Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Normalized White Space: `This is a uncLeaneD text with lots of extra WHITE space.`\n" + ] + } + ], + "source": [ + "model = nlpu.preprocessing.normalization.WhiteSpace()\n", + "print(f\"Normalized White Space: `{model.apply(text)}`\")" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "d405780a", + "metadata": { + "ExecuteTime": { + "end_time": "2025-10-23T09:42:40.080806Z", + "start_time": "2025-10-23T09:42:40.061140Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Uniform Case Folding: `\n", + " this is a uncleaned text with lots of\n", + " extra white \n", + "space.\n", + "`\n" + ] + } + ], + "source": [ + "model = nlpu.preprocessing.normalization.CaseFolding()\n", 
+ "print(f\"Uniform Case Folding: `{model.apply(text)}`\")" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "9f4aa5e6", + "metadata": { + "ExecuteTime": { + "end_time": "2025-10-23T09:46:21.586061Z", + "start_time": "2025-10-23T09:46:21.572034Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Uniform Case Folding: `This uncLeaneD text lots extra WHITE space .`\n" + ] + } + ], + "source": [ + "model = nlpu.preprocessing.normalization.StopWords()\n", + "print(f\"Uniform Case Folding: `{model.apply(text)}`\")" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "ca00225a", + "metadata": { + "ExecuteTime": { + "end_time": "2025-10-23T09:46:22.408174Z", + "start_time": "2025-10-23T09:46:22.400677Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Uniform Case Folding: `['This', 'is', 'a', 'uncLeaneD', 'text', 'with', 'lots', 'extra', 'WHITE']`\n" + ] + } + ], + "source": [ + "model = nlpu.preprocessing.utils.WordTokenize(vanilla = True, tokenizer = False, vanilla_getalnum = True)\n", + "print(f\"Uniform Case Folding: `{model.apply(text)}`\")" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "0a2d28cf", + "metadata": { + "ExecuteTime": { + "end_time": "2025-10-23T09:46:24.121050Z", + "start_time": "2025-10-23T09:46:24.101463Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "UNCLEANED TEXT LOTS EXTRA WHITE SPACE .\n" + ] + } + ], + "source": [ + "print(nlpu.preprocessing.normalization.normalize(text, upper = True, stopwords_in_uppercase = True))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "TensorFlow CPU (v2.12.0)", + "language": "python", + "name": "tensorflow" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + 
"pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nlpurify/feature/selection/nltk.py b/nlpurify/feature/selection/nltk.py deleted file mode 100644 index 4c8299e..0000000 --- a/nlpurify/feature/selection/nltk.py +++ /dev/null @@ -1,313 +0,0 @@ -# -*- encoding: utf-8 -*- - -""" -A Collection of Methods from the Natural Language Toolkit (NLTK) - -NLTK is a leading suite of library designed for symbolic and -statistical natural language program in Python. This module is -designed for the English language, however, the due to continued -effort from the community other languages are being incorporated. - -A set of functions are designed using the NLTK library for feature -selection methods like stop words removal, word lemmatizations etc. -A feature selection method works best when a text is normalized. -This can be achived by using :mod:`nlpurify.normalizeText()` method -and is generally internally by all the related functions defined. -""" - -import re - -from typing import Union - -from nltk.corpus import stopwords -from nltk.tokenize import word_tokenize - -from nlpurify.preprocessing.normalization import normalize - -def tokenize_text(text : str, regexp : bool = False, vanilla : bool = False, **kwargs) -> list: - """ - Tokenization of Text into Lists of Substrings - - A word vector is one where a scentence is broken down into small - pieces (or vectors) which constitutes the features of an model. - To achieve tokenization, the most simpler approach is by using the - in-built string method ``text.split()`` which splits the string - by white characters. However, this is often in efficient and - results in an improper model development. This can be resolved by - using the :mod:`nltk.tokenize` methods. - - By default, the function is tuned with the ``word_tokenize`` - method which works as below: - - .. 
code-block:: python - - from nltk.tokenize import word_tokenize - s = '''Good muffins cost $3.88\\nin New York. Please buy me - two of them.\\n\\nThanks.''' - - word_tokenize(s) - >> ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', - 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', - '.', 'Thanks', '.'] - - The power of tokenization is appreciated even more when the text - data is cleaned and normalized as defined in the function - :func:`nlpurify.feature.selection.nltk.remove_stopwords`. - - :type text: str - :param text: The raw text which is internally normalized using - :func:`nlpurify.normalizeText()` for feature selection. To - stop normalization of text see the keyword arguments section. - - :type vanilla: bool - :param vanilla: Override the :func:`nltk.tokenize.word_tokenize` - function with Python vanilla method. Either the method allows - setting ``regexp == True`` or ``vanilla == True`` while both - are not allowed. The vanilla method uses string attributes - like ``.split()`` and other keyword arguments to control - and tokenize the data. - - :type regexp: bool - :param regexp: Override the :func:`nltk.tokenize.word_tokenize` - function with a regular expressions. Either the method allows - setting ``regexp == True`` or ``vanilla == True`` while both - are not allowed. - - **Keyword Arguments** - - The default keyword arguments are defined for the - :func:`nltk.tokenize.word_tokenize` function. - - * **preserve_line** (*bool*): A flag to decide whether to - sentence tokenize the text or not, as accepted by - the function. Defaults to False. - - * **tokenize_language** (*str*): The language model name as - accepted by the Punkt corpus by NLTK. Defaults to the - "english" language, as in function. - - The paramter value associated with regular expression data - control is as below: - - * **expression** (*str*): The regular expression which - is used to compile the regular expression. This should - be a r-string which can be directly used. 
Defaults to - ``r"\w+"` value, i.e. only word characters. - - The paramter value associated with the Python vanilla method - control is as below: - - * **split_by** (*str*): The value is passed to :func:`split()` - to control seperated terms, defaults to white space. - - * **retalpha** (*bool*): If set to True (default) returns only - alphabets which is an inherent string property: ``isalpha`` - method. If both ``retalpha`` and ``retalnum`` is true, then - ``retalpha`` has an overriding effect. - - * **retalnum** (*bool*): If set to True (default) returns only - alphabets and numeric characters from the string which uses - an inherent string property: ``isalnum`` method. - - **Function Example** - - The function primarily returns a list of strings which can be - used to create word vector. - - .. code-block:: python - - s = "this is an example string, with p()nct & n0s." - - # using the default word_tokenize: - print(nlpurify.feature_selection.tokenize_text(s)) - >> ['this', 'is', 'an', 'example', 'string', ',', 'with', 'p', '(', ')', 'nct', '&', 'n0s', '.'] - - # using regular expressions, default configuration - print(nlpurify.feature_selection.tokenize_text(s, regexp = True)) - >> ['this', 'is', 'an', 'example', 'string', 'with', 'p', 'nct', 'n0s'] - - # a custom regular expressions is also accepted, feel free to experiment! - # the following expression mimics the above example r"\w+" but using custom expression - print(nlpurify.feature_selection.tokenize_text(s, regexp = True, expression = r"[a-zA-Z0-9_]+")) - >> ['this', 'is', 'an', 'example', 'string', 'with', 'p', 'nct', 'n0s'] - - # this is using vanilla python string functions with default values - print(nlpurify.feature_selection.tokenize_text(s, vanilla = True)) - >> ['this', 'is', 'an', 'example', 'with'] - - # to understand the difference using retalpha == False, i.e., retalnum = True - s = "this is an example string, with p()nct & n0s. 
987" - print(nlpurify.feature_selection.tokenize_text(s, vanilla = True, retalpha = False)) - >> ['this', 'is', 'an', 'example', 'with', '987'] - - **Error Guidelines** - - :raises ValueError: The error is raised when both the attribute - ``vanilla`` and ``regexp`` is set to True. - - :raises ImportError: Error is raised when one or more nltk corpus - is not available in the system. - - **Return Type** - - :rtype: list[str] - :return: Returns a tokenized list of strings. To represent and - save the same in a tabular format use ``"".join()`` method. - """ - - preserve_line = kwargs.get("preserve_line", False) - tokenize_language = kwargs.get("tokenize_language", "english") - - expression = re.compile(kwargs.get("expression", r"\w+")) - - split_by = kwargs.get("split_by", " ") - retalpha = kwargs.get("retalpha", True) - retalnum = kwargs.get("retalnum", True) - - tokenize_method = None - # ! do not allow both `regexp` and `vanilla` as true - if all([regexp, vanilla]): - raise ValueError("Both Control are Not Allowed.") - elif any([regexp, vanilla]): - # ? if the error is not raised, execute below - else section - if regexp: - tokenize_method = "regexp" - else: - # ? 
vanilla method is selected, however based - # on keyword selection - setting different index - if retalpha: - # this has an ovverriding effect thus first - tokenize_method = "vanilla-alnum" - elif retalnum: - tokenize_method = "vanilla-alpha" - else: - tokenize_method = "vanilla-vanilla" - else: - tokenize_method = "word-tokenize" - - # select method and parameter control, and finally - # return token data value from the choice of method: - tokens = { - "regexp" : expression.findall(text), - - # we've the three methods of vanilla like: - "vanilla-alnum" : [s for s in text.split(split_by) if s.isalnum()], - "vanilla-alpha" : [s for s in text.split(split_by) if s.isalpha()], - "vanilla-vanilla" : text.split(split_by), # default, no change - - "word-tokenize" : word_tokenize(text, language = tokenize_language, preserve_line = preserve_line) - } - - return tokens[tokenize_method] - - -def remove_stopwords(text : str, language : str = "english", rtype : object = str, **kwargs) -> str | list: - """ - Function to Remove Stopwods from a Raw Text using NLTK - - In a Natural Language Processing (NLP), stopwords are frequently - removed to improve the performance of the model and increase the - computational efficiency. Words like "and", "or", etc. does not - provide additional information to the model but are important for - communications. - - The NLTK library hosts a lists of words that should be removed. - To find the list of supported language, check the fields: - - .. code-block:: python - - from nltk.corpus import stopwords - print(stopwords.fileids()) # list of supported language - - Stopwords is available under :mod:`nltk.corpus` which is required - by the function. To download a corpus: - - .. code-block:: python - - import nltk - nltk.download("all") # download all the available corpus - nltk.download("stopwords") # only download stopwrds corpus - - While it is advisable to download all the corpus, but it is - left on the discretion of the user. 
The function may throw an - error in case there are additional dependent corpus which needs - to be downloaded. - - :type text: str - :param text: The raw text which is internally normalized using - :func:`nlpurify.normalizeText()` for feature selection. To - stop normalization of text see the keyword arguments section. - - :type language: str - :param language: The name of the language which is available - under the :mod:`nltk.corpus.stopwords`. To find the list - of accepted languages check ``stopwords.fields()``. Defaults - to the "english" language. - - **Keyword Arguments** - - * **tokenize** (*bool*): Tokenize the text using the - :mod:`nltk.tokenize.word_tokenize()` method to extract the - tokens from the strings which returns the syllables from - a single word. Defaults to True. If false, the function - internally creates a token vector using ``text.split()`` - method, which splits the words by spaces - which may - have unwanted effect when symbols are present in the text. - Check attributes of :func:`tokenize()` for more details. - - * **normalize** (*bool*): Normalize the text internally. - Defaults to true, else if the data is already normalized - then pass False. Note the function may not be able to - remove all the stopwords if the data is not normalized and - the case of the words is not lower. - - **Function Example** - - For more control over the tokenization, all the parameters - of :func:`tokenize_text()` is accepted. - - .. code-block:: python - - s = "this is an example string, with p()nct & n0s." - - # using defaults from tokenize_text, i.e., using word_tokenize - print(nlpurify.feature_selection.remove_stopwords(s)) - >> example string , p ( ) nct & n0s . - - # this we can further simplify by using other features - print(nlpurify.feature_selection.remove_stopwords(s, regexp = True)) - >> example string p nct n0s - - **Error Guidelines** - - :raises ValueError: The error is raised when the return type is - not in {str, list} values. 
Make sure the data type is an type - instance and is not passed as a string value. - - :raises ImportError: Error is raised when one or more nltk corpus - is not available in the system. - - **Return Type** - - :rtype: str | list - :return: A cleaned string or a vector (*iterable*) of selected - features from a given text message. - """ - - tokenize_ = kwargs.get("tokenize", True) - normalize_ = kwargs.get("normalize", True) - - stopwords_ = stopwords.words(language) # defaults to english - - # ? normalize the text using nlpurify.normalizeText() - # else, left at user's discreations or additional functionalities - text = normalize(text, **kwargs) if normalize_ else text - - tokens = tokenize_text(text, **kwargs) if tokenize_ else text - tokens = [word for word in tokens if word not in stopwords_] - - # ensure return type of the data, else raise error - if rtype not in [str, list]: - raise ValueError(f"Accepted arguments ``list`` or ``str`` received {rtype}.") - - return " ".join(tokens) if rtype == str else tokens diff --git a/nlpurify/preprocessing/__init__.py b/nlpurify/preprocessing/__init__.py index ea100bb..be0a62c 100644 --- a/nlpurify/preprocessing/__init__.py +++ b/nlpurify/preprocessing/__init__.py @@ -4,4 +4,5 @@ Utility Tools to Convert Raw Texts into a Structured Format """ +from nlpurify.preprocessing import utils from nlpurify.preprocessing import normalization diff --git a/nlpurify/preprocessing/normalization.py b/nlpurify/preprocessing/normalization.py index 68f35ad..9c98dbe 100644 --- a/nlpurify/preprocessing/normalization.py +++ b/nlpurify/preprocessing/normalization.py @@ -31,7 +31,8 @@ from abc import ABC, abstractmethod from nltk.corpus import stopwords -from nltk.tokenize import word_tokenize + +from nlpurify.preprocessing.utils import WordTokenize class _base_normalize(BaseModel, ABC): """ @@ -144,13 +145,13 @@ class StopWords(_base_normalize): # ! 
removal of stop words is associated with word tokenization tokenize : bool = True + tokenize_config : WordTokenize = WordTokenize() def apply(self, text : str) -> str: stopwords_ = stopwords.words(self.language) + self.extrawords - tokenized_ = word_tokenize( - text, language = self.language, preserve_line = False - ) if self.tokenize else text.split() + tokenized_ = self.tokenize_config.apply(text) \ + if self.tokenize else text.split() # case folding of stopwords in upper/lower case as per need stopwords_ = list(map( diff --git a/nlpurify/preprocessing/utils.py b/nlpurify/preprocessing/utils.py new file mode 100644 index 0000000..0231093 --- /dev/null +++ b/nlpurify/preprocessing/utils.py @@ -0,0 +1,78 @@ +# -*- encoding: utf-8 -*- + +""" +Utility Functions for Text Preprocessings +""" + +import re + +from pydantic import BaseModel +from nltk.tokenize import word_tokenize + +class WordTokenize(BaseModel): + """ + Tokenize text into word vectors using different types of methods + to achieve cleaner text in desired formats. + + :param regexp, vanilla, tokenizer: Selection methods for different + tokenization techniques. Set the value to ``regexp = True`` to + tokenize text using regular expressions, for using pure Python + based text tokenization use the ``vanilla = True`` method, and + ``tokenizer = True`` (default) is for using external tokenizer + functions like :func:`nltk.tokenize.word_tokenize` methods. + The function will throw error if all of the values are set to + true, and only one can be true at a time. + """ + + regexp : bool = False + vanilla : bool = False + tokenizer : bool = True + + # ? additional settings for regular expressions + regexp_pattern : str = r"\w+" + + # ? additional settings for vanilla based methods + vanilla_split_by : str = " " + vanilla_getalpha : bool = False + vanilla_getalnum : bool = False + + # ? 
additional settings for tokenizer based method + tokenizer_language : str = "english" + tokenizer_preserve_line : bool = False + + + def apply(self, text : str) -> str: + method = "regexp" if self.regexp else "vanilla" \ + if self.vanilla else "tokenizer" if self.tokenizer \ + else None # none should not be generated, validated values + + if method == "regexp": + expression = re.compile(self.regexp_pattern) + text = expression.findall(text) + elif method == "vanilla": + submethod = "retalpha" if self.vanilla_getalpha \ + else "retalnum" if self.vanilla_getalnum else None + + _functions = { + "retalpha" : [ + token for token in text.split(self.vanilla_split_by) + if token.isalpha() + ], + + "retalnum" : [ + token for token in text.split(self.vanilla_split_by) + if token.isalnum() + ] + } + + text = _functions.get(submethod, text) + elif method == "tokenizer": + text = word_tokenize( + text, + language = self.tokenizer_language, + preserve_line = self.tokenizer_preserve_line + ) + else: + pass + + return text