diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index f5d67dc..6599cdd 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -8,25 +8,25 @@ on: jobs: build: name: Python ${{ matrix.python-version }} - runs-on: ubuntu-20.04 + # https://stackoverflow.com/questions/70959954/error-waiting-for-a-runner-to-pick-up-this-job-using-github-actions + runs-on: ubuntu-latest strategy: matrix: python-version: - - "3.6" - - "3.9" - "3.9" - "3.11" - "3.12" + - "3.13" steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v5 - - uses: actions/setup-python@v2 + - uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - - uses: actions/cache@v2 + - uses: actions/cache@v4 with: path: ~/.cache/pip key: ${{ runner.os }}-pip-${{ hashFiles('requirements/dev.txt') }} diff --git a/README.md b/README.md index c5eeed9..c774071 100644 --- a/README.md +++ b/README.md @@ -3,10 +3,9 @@ [![CI](https://github.com/remigermain/nested-multipart-parser/actions/workflows/main.yml/badge.svg)](https://github.com/remigermain/nested-multipart-parser/actions/workflows/main.yml) [![pypi](https://img.shields.io/pypi/v/nested-multipart-parser)](https://pypi.org/project/nested-multipart-parser/) -![PyPI - Downloads](https://img.shields.io/pypi/dm/Nested-multipart-parser) - -Parser for nested data for '*multipart/form*', you can use it in any python project, or use the Django Rest Framework integration. +[![PyPI - Downloads](https://img.shields.io/pypi/dm/Nested-multipart-parser)](https://pypistats.org/packages/nested-multipart-parser) +Parser for nested data for *multipart/form*, usable in any Python project or via the [Django Rest Framework integration](https://www.django-rest-framework.org/community/third-party-packages/#parsers).
# Installation: ```bash @@ -35,6 +34,16 @@ def my_view(): ### Django Rest Framework +You can define the parser for all views in settings.py: +```python +REST_FRAMEWORK = { + "DEFAULT_PARSER_CLASSES": [ + "nested_multipart_parser.drf.DrfNestedParser", + ] +} +``` +Or set it directly in your view: + ```python from nested_multipart_parser.drf import DrfNestedParser ... @@ -46,7 +55,7 @@ class YourViewSet(viewsets.ViewSet): ## What it does: -The parser take the request data and transform it to a Python dictionary: +The parser takes the request data and transforms it into a Python dictionary. example: @@ -94,126 +103,106 @@ example: } ``` -## How it works: - -Attributes where sub keys are full numbers only are automatically converted into lists: +## How it works +### Lists +Attributes whose sub‑keys are *only numbers* become Python lists: ```python - data = { - 'title[0]': 'my-value', - 'title[1]': 'my-second-value' - } - output = { - 'title': [ - 'my-value', - 'my-second-value' - ] - } - - # Be aware of the fact that you have to respect the order of the indices for arrays, thus - 'title[2]': 'my-value' # Invalid (you have to set title[0] and title[1] before) - - # Also, you can't create an array on a key already set as a prinitive value (int, boolean or string): - 'title': 42, - 'title[object]': 42 # Invalid +data = { + 'title[0]': 'my-value', + 'title[1]': 'my-second-value' +} +output = { + 'title': [ + 'my-value', + 'my-second-value' + ] +} ``` +> Important notes - - -Attributes where sub keys are other than full numbers are converted into Python dictionary: - +- Indices must be contiguous and start at 0. +- You cannot turn a primitive (int, bool, str) into a list or dictionary later, e.g.:
```python - data = { - 'title.key0': 'my-value', - 'title.key7': 'my-second-value' - } - output = { - 'title': { - 'key0': 'my-value', - 'key7': 'my-second-value' - } - } - - - # You have no limit for chained key: - # with "mixed-dot" separator option (same as 'mixed' but with dot after list to object): - data = { - 'the[0].chained.key[0].are.awesome[0][0]': 'im here !!' - } - # with "mixed" separator option: - data = { - 'the[0]chained.key[0]are.awesome[0][0]': 'im here !!' - } - # With "bracket" separator option: - data = { - 'the[0][chained][key][0][are][awesome][0][0]': 'im here !!' - } - # With "dot" separator option: - data = { - 'the.0.chained.key.0.are.awesome.0.0': 'im here !!' - } + 'title': 42, + 'title[object]': 42 # ❌ invalid ``` +### Dictionaries +Attributes whose sub‑keys are *not pure numbers* become nested dictionaries: +```python +data = { + 'title.key0': 'my-value', + 'title.key7': 'my-second-value' +} +output = { + 'title': { + 'key0': 'my-value', + 'key7': 'my-second-value' + } +} +``` -For this to work perfectly, you must follow the following rules: - -- A first key always need to be set. ex: `title[0]` or `title`. In both cases the first key is `title` - -- For `mixed` or `mixed-dot` options, brackets `[]` is for list, and dot `.` is for object +### Chaining keys -- For `mixed-dot` options is look like `mixed` but with dot when object follow list +>Keys can be chained arbitrarily. 
Below are examples for each separator option: -- For `bracket` each sub key need to be separate by brackets `[ ]` or with `dot` options `.` +|Separator| Example key | Meaning| +|-|-|-| +|`mixed-dot`| `the[0].chained.key[0].are.awesome[0][0]` |List → object → list → object …| +|`mixed`| `the[0]chained.key[0]are.awesome[0][0]` | Same as `mixed-dot` but without the dot after a list| +|`bracket`| `the[0][chained][key][0][are][awesome][0][0]` | Every sub‑key is wrapped in brackets| +|`dot` |`the.0.chained.key.0.are.awesome.0.0` | Dots separate every level; numeric parts become lists| -- For `bracket` or `dot`options, if a key is number is convert to list else a object -- Don't put spaces between separators. +Rules to keep in mind +- First key must exist – e.g. `title[0]` or just `title`. +- For `mixed` / `mixed-dot`, `[]` denotes a list and `.` denotes an object. +- `mixed-dot` behaves like `mixed` but inserts a dot when an object follows a list. +- For `bracket`, each sub‑key must be surrounded by brackets (`[ ]`). +- For `bracket` or `dot`, numeric sub‑keys become list elements; non‑numeric become objects. +- No spaces between separators. +- By default, duplicate keys are disallowed (see options).
+- Empty structures are supported: + Empty list → "article.authors[]": None → {"article": {"authors": []}} (available with bracket, mixed, mixed-dot) + Empty dict → "article.": None → {"article": {}} (available with dot, mixed, mixed-dot) -- By default, you can't set set duplicates keys (see options) -- You can set empty dict/list: - for empty list: `"article.authors[]": None` -> `{"article": {"authors": [] }}` - for empty dict: `"article.": None` -> `{"article": {} }` - `.` last dot for empty dict (availables in `dot`, `mixed` and `mixed-dot` options) - `[]` brackets empty for empty list (availables in `brackets`, `mixed` and `mixed-dot` options) - ## Options ```python { - # Separators: - # with mixed-dot: article[0].title.authors[0]: "jhon doe" - # with mixed: article[0]title.authors[0]: "jhon doe" - # with bracket: article[0][title][authors][0]: "jhon doe" - # with dot: article.0.title.authors.0: "jhon doe" - 'separator': 'bracket' or 'dot' or 'mixed' or 'mixed-dot', # default is `mixed-dot` - - - # raise a expections when you have duplicate keys - # ex : - # { - # "article": 42, - # "article[title]": 42, - # } - 'raise_duplicate': True, # default is True - - # override the duplicate keys, you need to set "raise_duplicate" to False - # ex : - # { - # "article": 42, - # "article[title]": 42, - # } - # the out is - # ex : - # { - # "article"{ - # "title": 42, - # } - # } - 'assign_duplicate': False # default is False + # Separator (default: 'mixed-dot') + # mixed-dot : article[0].title.authors[0] -> "john doe" + # mixed : article[0]title.authors[0] -> "john doe" + # bracket : article[0][title][authors][0] -> "john doe" + # dot : article.0.title.authors.0 -> "john doe" + 'separator': 'bracket' | 'dot' | 'mixed' | 'mixed-dot', + + # Raise an exception when duplicate keys are encountered + # Example: + # { + # "article": 42, + # "article[title]": 42, + # } + 'raise_duplicate': True, # default: True + + # Override duplicate keys (requires raise_duplicate=False) + # Example: + # { + # "article": 42, + #
"article[title]": 42, + # } + # Result: + # { + # "article": { + # "title": 42 + # } + # } + 'assign_duplicate': False, # default: False } ``` @@ -223,20 +212,20 @@ For this to work perfectly, you must follow the following rules: # settings.py ... +# settings.py DRF_NESTED_MULTIPART_PARSER = { - "separator": "mixed-dot", - "raise_duplicate": True, - "assign_duplicate": False, + "separator": "mixed-dot", + "raise_duplicate": True, + "assign_duplicate": False, - # output of parser is converted to querydict - # if is set to False, dict python is returned - "querydict": True, + # If True, the parser's output is converted to a QueryDict; + # if False, a plain Python dict is returned. + "querydict": True, } ``` ## JavaScript integration: - -You can use this [multipart-object](https://github.com/remigermain/multipart-object) library to easy convert object to flat nested object formatted for this library +A companion [multipart-object](https://github.com/remigermain/multipart-object) library exists to convert a JavaScript object into the flat, nested format expected by this parser.
## License diff --git a/bench/bench.py b/bench/bench.py new file mode 100644 index 0000000..addef26 --- /dev/null +++ b/bench/bench.py @@ -0,0 +1,77 @@ +import time + +from nested_multipart_parser import NestedParser + + +def bench(data, count): + v = [] + for _ in range(count): + start = time.perf_counter() + parser = NestedParser(data) + parser.is_valid() + validate_data = parser.validate_data + end = time.perf_counter() + v.append(end - start) + + return sum(v) / len(v) + + +def big(count): + data = { + "title": "title", + "date": "time", + "langs[0].id": "id", + "langs[0].title": "title", + "langs[0].description": "description", + "langs[0].language": "language", + "langs[1].id": "id1", + "langs[1].title": "title1", + "langs[1].description": "description1", + "langs[1].language": "language1", + "test.langs[0].id": "id", + "test.langs[0].title": "title", + "test.langs[0].description": "description", + "test.langs[0].language": "language", + "test.langs[1].id": "id1", + "test.langs[1].title": "title1", + "test.langs[1].description": "description1", + "test.langs[1].language": "language1", + "deep.nested.dict.test.langs[0].id": "id", + "deep.nested.dict.test.langs[0].title": "title", + "deep.nested.dict.test.langs[0].description": "description", + "deep.nested.dict.test.langs[0].language": "language", + "deep.nested.dict.test.langs[1].id": "id1", + "deep.nested.dict.test.langs[1].title": "title1", + "deep.nested.dict.test.langs[1].description": "description1", + "deep.nested.dict.test.langs[1].language": "language1", + "deep.nested.dict.with.list[0].test.langs[0].id": "id", + "deep.nested.dict.with.list[0].test.langs[0].title": "title", + "deep.nested.dict.with.list[1].test.langs[0].description": "description", + "deep.nested.dict.with.list[1].test.langs[0].language": "language", + "deep.nested.dict.with.list[1].test.langs[1].id": "id1", + "deep.nested.dict.with.list[1].test.langs[1].title": "title1", + "deep.nested.dict.with.list[0].test.langs[1].description": 
"description1", + "deep.nested.dict.with.list[0].test.langs[1].language": "language1", + } + return bench(data, count) + + +def small(count): + data = { + "title": "title", + "date": "time", + "langs[0].id": "id", + "langs[0].title": "title", + "langs[0].description": "description", + "langs[0].language": "language", + "langs[1].id": "id1", + "langs[1].title": "title1", + "langs[1].description": "description1", + "langs[1].language": "language1", + } + return bench(data, count) + + +count = 10_000 +print(f"{small(count)=}") +print(f"{big(count)=}") diff --git a/nested_multipart_parser/declare.py b/nested_multipart_parser/declare.py deleted file mode 100644 index db2c95a..0000000 --- a/nested_multipart_parser/declare.py +++ /dev/null @@ -1,73 +0,0 @@ -class NestedDeclare: - """Create ditc/list wihout order""" - - def __init__(self, _type=None, options=None): - self._elements = {} - self._options = options or {} - self.set_type(_type) - - def __repr__(self): - return f"{type(self).__name__}({self._type.__name__})" - - def set_type(self, _type): - self._type = _type - self._is_dict = _type is dict - self._is_list = _type is list - self._is_none = _type is None - - def get_type(self): - return self._type - - def set_type_from_key(self, key): - self.set_type(list if isinstance(key, int) else dict) - - def conv_value(self, value): - if isinstance(value, type(self)): - value = value.convert() - return value - - def __setitem__(self, key, value): - if self._is_none: - self.set_type_from_key(key) - if isinstance(key, int) and not self._is_list: - raise ValueError("int key cant be integer for dict object") - if not isinstance(key, int) and self._is_list: - raise ValueError("need integer key for list elements") - - if key in self._elements: - if ( - isinstance(value, type(self)) - and isinstance(self._elements[key], type(self)) - and self._elements[key].get_type() == value.get_type() - ): - return - - if self._options.get("raise_duplicate"): - raise ValueError("key is already 
set") - - if not self._options.get("assign_duplicate"): - return - - self._elements[key] = value - - def __getitem__(self, key): - if key not in self._elements: - self[key] = type(self)(options=self._options) - return self._elements[key] - - def _convert_list(self): - keys = sorted(self._elements.keys()) - if keys != list(range(len(keys))): - raise ValueError("invalid format list keys") - - return [self.conv_value(self._elements[key]) for key in keys] - - def _convert_dict(self): - return {key: self.conv_value(value) for key, value in self._elements.items()} - - def convert(self): - if self._is_none: - return None - if self._is_list: - return self._convert_list() - return self._convert_dict() diff --git a/nested_multipart_parser/options.py b/nested_multipart_parser/options.py index 6a6e6d5..0047180 100644 --- a/nested_multipart_parser/options.py +++ b/nested_multipart_parser/options.py @@ -1,5 +1,16 @@ import re +# compatibilty python < 3.9 +try: + from functools import cache +except ImportError: + from functools import lru_cache as cache + + +@cache +def cache_regex_compile(*ar, **kw): + return re.compile(*ar, **kw) + class InvalidFormat(Exception): """key is invalid formated""" @@ -16,7 +27,7 @@ def __new__(cls, cls_name, ns, childs): return super().__new__(cls, cls_name, ns, childs) -TOKEN_PARSER = ("[", "]", ".") +INVALID_TOKEN_PARSER = ("[", "]", ".") class NestedParserOptionsAbstract(metaclass=NestedParserOptionsType): @@ -25,7 +36,7 @@ def check(self, key, keys): raise InvalidFormat(key) first = keys[0] - for token in TOKEN_PARSER: + for token in INVALID_TOKEN_PARSER: if token in first: raise InvalidFormat(key) @@ -35,9 +46,9 @@ def check(self, key, keys): for c in key: if c.isspace(): raise InvalidFormat(key) - + def split(self, key): - contents = list(filter(None, self._reg_spliter.split(key))) + contents = [v for v in self._reg_spliter.split(key) if v] if not contents: raise ValueError(f"invalid form key: {key}") @@ -47,21 +58,13 @@ def split(self, key): 
if len(contents) == 3: lst.append(contents[2]) - return list(filter(None, lst)) - - -REGEX_SEPARATOR = { - "dot": r"(\.[^\.]+)", - "bracket": r"([^\[\]]+)", - "mixed": r"(\[\d+\])|([^\[\]]+)", - "mixed-dot": r"(\[\d+\])|(\.[^\[\]\.]+)", -} + return [v for v in lst if v] class NestedParserOptionsDot(NestedParserOptionsAbstract): def __init__(self): - self._reg_spliter = re.compile(r"^([^\.]+)(.*?)(\.)?$") - self._reg_options = re.compile(r"(\.[^\.]+)") + self._reg_spliter = cache_regex_compile(r"^([^\.]+)(.*?)(\.)?$") + self._reg_options = cache_regex_compile(r"(\.[^\.]+)") def sanitize(self, key, value): contents = self.split(key) @@ -88,8 +91,8 @@ def sanitize(self, key, value): class NestedParserOptionsBracket(NestedParserOptionsAbstract): def __init__(self): - self._reg_spliter = re.compile(r"^([^\[\]]+)(.*?)(\[\])?$") - self._reg_options = re.compile(r"(\[[^\[\]]+\])") + self._reg_spliter = cache_regex_compile(r"^([^\[\]]+)(.*?)(\[\])?$") + self._reg_options = cache_regex_compile(r"(\[[^\[\]]+\])") def sanitize(self, key, value): first, *lst = self.split(key) @@ -117,8 +120,10 @@ def sanitize(self, key, value): class NestedParserOptionsMixedDot(NestedParserOptionsAbstract): def __init__(self): - self._reg_spliter = re.compile(r"^([^\[\]\.]+)(.*?)((?:\.)|(?:\[\]))?$") - self._reg_options = re.compile(r"(\[\d+\])|(\.[^\[\]\.]+)") + self._reg_spliter = cache_regex_compile( + r"^([^\[\]\.]+)(.*?)((?:\.)|(?:\[\]))?$" + ) + self._reg_options = cache_regex_compile(r"(\[\d+\])|(\.[^\[\]\.]+)") def sanitize(self, key, value): first, *lst = self.split(key) @@ -152,8 +157,10 @@ def sanitize(self, key, value): class NestedParserOptionsMixed(NestedParserOptionsMixedDot): def __init__(self): - self._reg_spliter = re.compile(r"^([^\[\]\.]+)(.*?)((?:\.)|(?:\[\]))?$") - self._reg_options = re.compile(r"(\[\d+\])|(\.?[^\[\]\.]+)") + self._reg_spliter = cache_regex_compile( + r"^([^\[\]\.]+)(.*?)((?:\.)|(?:\[\]))?$" + ) + self._reg_options = 
cache_regex_compile(r"(\[\d+\])|(\.?[^\[\]\.]+)") def sanitize(self, key, value): first, *lst = self.split(key) diff --git a/nested_multipart_parser/parser.py b/nested_multipart_parser/parser.py index 743310e..2d07721 100644 --- a/nested_multipart_parser/parser.py +++ b/nested_multipart_parser/parser.py @@ -1,11 +1,10 @@ -from nested_multipart_parser.declare import NestedDeclare from nested_multipart_parser.options import ( - NestedParserOptionsMixedDot, - NestedParserOptionsMixed, NestedParserOptionsBracket, NestedParserOptionsDot, + NestedParserOptionsMixed, + NestedParserOptionsMixedDot, ) - +from nested_multipart_parser.temp_element import TempDict, TempList DEFAULT_OPTIONS = { "separator": "mixed-dot", @@ -25,19 +24,24 @@ class NestedParser: _valid = None errors = None - def __init__(self, data, options={}): + def __init__(self, data, options=None): self.data = data - self._options = {**DEFAULT_OPTIONS, **options} - - assert self._options["separator"] in ["dot", "bracket", "mixed", "mixed-dot"] + self._options = {**DEFAULT_OPTIONS, **(options or {})} + + assert self._options["separator"] in [ + "dot", + "bracket", + "mixed", + "mixed-dot", + ] assert isinstance(self._options["raise_duplicate"], bool) assert isinstance(self._options["assign_duplicate"], bool) self._cls_options = REGEX_SEPARATOR[self._options["separator"]] def _split_keys(self, data): + checker = self._cls_options() for key, value in data.items(): - checker = self._cls_options() keys, value = checker.sanitize(key, value) checker.check(key, keys) @@ -47,16 +51,16 @@ def convert_value(self, value): return value def construct(self, data): - dictionary = NestedDeclare(dict, self._options) + dictionary = TempDict(self._options) for keys, value in self._split_keys(data): tmp = dictionary for actual_key, next_key in zip(keys, keys[1:]): if isinstance(next_key, int): - tmp[actual_key] = NestedDeclare(list, self._options) + tmp[actual_key] = TempList(self._options) else: - tmp[actual_key] = 
NestedDeclare(dict, self._options) + tmp[actual_key] = TempDict(self._options) tmp = tmp[actual_key] tmp[keys[-1]] = self.convert_value(value) diff --git a/nested_multipart_parser/temp_element.py b/nested_multipart_parser/temp_element.py new file mode 100644 index 0000000..befc53d --- /dev/null +++ b/nested_multipart_parser/temp_element.py @@ -0,0 +1,73 @@ +import abc +from typing import Any + + +class TempElement(abc.ABC): + @abc.abstractclassmethod + def __setitem__(self, key, val): + """method to set element""" + + def check(self, key, value): + if key in self._elements: + # same instance like templist to templist, we ignore it + if isinstance(self._elements[key], type(value)): + return + + if self._options.get("raise_duplicate"): + raise ValueError("key is already set") + + if not self._options.get("assign_duplicate"): + return + + self._elements[key] = value + + def __getitem__(self, key): + if key not in self._elements: + self[key] = type(self)(options=self._options) + return self._elements[key] + + def conv_value(self, value: Any) -> Any: + if isinstance(value, TempElement): + value = value.convert() + return value + + @abc.abstractmethod + def convert(self): + """method to convert tempoary element to real python element""" + + +class TempList(TempElement): + def __init__(self, options=None): + self._options = options or {} + self._elements = {} + + def __setitem__(self, key: int, value: Any): + assert isinstance(key, int), ( + f"Invalid key for list, need to be int, type={type(key)}" + ) + self.check(key, value) + + def convert(self) -> list: + keys = sorted(self._elements.keys()) + # check if index start to 0 and end to number of elements + if any((keys[0] != 0, keys[-1] != (len(self._elements) - 1))): + raise ValueError("invalid format list keys") + + return [self.conv_value(self._elements[key]) for key in keys] + + +class TempDict(TempElement): + def __init__(self, options=None): + self._options = options or {} + self._elements = {} + + def 
__setitem__(self, key: str, value: Any): + assert isinstance(key, str), ( + f"Invalid key for dict, need to be str, type={type(key)}" + ) + self.check(key, value) + + def convert(self) -> dict: + return { + key: self.conv_value(value) for key, value in self._elements.items() + } diff --git a/setup.py b/setup.py index 418c7aa..b832a4d 100755 --- a/setup.py +++ b/setup.py @@ -1,17 +1,18 @@ #!/usr/bin/env python3 -import setuptools import os -import sys import subprocess +import sys + +import setuptools -version = "1.5.0" +version = "1.6.0" -if sys.argv[-1] == 'publish': +if sys.argv[-1] == "publish": if os.system("pip freeze | grep twine"): print("twine not installed.\nUse `pip install twine`.\nExiting.") sys.exit() - os.system('rm -rf dist nested_multipart_parser.egg-info') + os.system("rm -rf dist nested_multipart_parser.egg-info") os.system("python setup.py sdist") if os.system("twine check dist/*"): print("twine check failed. Packages might be outdated.") @@ -21,7 +22,7 @@ sys.exit() -with open("README.md", "r", encoding="utf-8") as fh: +with open("README.md", encoding="utf-8") as fh: long_description = fh.read() setuptools.setup( @@ -35,7 +36,7 @@ long_description_content_type="text/markdown", url="https://github.com/remigermain/nested-multipart-parser", project_urls={ - "Bug Tracker": "https://github.com/remigermain/nested-multipart-parser/issues", + "Bug Tracker": "https://github.com/remigermain/nested-multipart-parser/issues" }, classifiers=[ "Development Status :: 5 - Production/Stable",