diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 00000000000..9227acd97bb --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,14 @@ +# Contributor Guide + +Synapse is a Python application that has Rust modules via pyo3 for performance. + +## Dev Environment Tips +- Source code is primarily in `synapse/`, tests are in `tests/`. +- Run `poetry install --dev` to install development Python dependencies. This will also build and install the Synapse Rust code. +- Use `./scripts-dev/lint.sh` to lint the codebase (this attempts to fix issues as well). This should be run and produce no errors before every commit. + +## Testing Instructions +- Find the CI plan in the `.github/workflows` folder. +- Use `poetry run trial tests` to run all unit tests, or `poetry run trial tests.metrics.test_phone_home_stats.PhoneHomeStatsTestCase` (for example) to run a single test case. The commit should pass all tests before you merge. +- Some typing warnings are expected currently. Fix any test or type *errors* until the whole suite is green. +- Add or update relevant tests for the code you change, even if nobody asked. diff --git a/docs/development/contributing_guide.md b/docs/development/contributing_guide.md index d6efab96cfb..64818d6f12c 100644 --- a/docs/development/contributing_guide.md +++ b/docs/development/contributing_guide.md @@ -29,7 +29,7 @@ easiest way of installing the latest version is to use [rustup](https://rustup.r Synapse can connect to PostgreSQL via the [psycopg2](https://pypi.org/project/psycopg2/) Python library. Building this library from source requires access to PostgreSQL's C header files. On Debian or Ubuntu Linux, these can be installed with `sudo apt install libpq-dev`. -Synapse has an optional, improved user search with better Unicode support. For that you need the development package of `libicu`. On Debian or Ubuntu Linux, this can be installed with `sudo apt install libicu-dev`. 
+Synapse bundles the ICU library via Rust, so no additional `libicu` package is required for improved user search. The source code of Synapse is hosted on GitHub. You will also need [a recent version of git](https://github.com/git-guides/install-git). diff --git a/docs/development/dependencies.md b/docs/development/dependencies.md index fa5ff4dcf7f..e381b3d1555 100644 --- a/docs/development/dependencies.md +++ b/docs/development/dependencies.md @@ -164,10 +164,7 @@ $ poetry cache clear --all . # including the wheel artifacts which is not covered by the above command # (see https://github.com/python-poetry/poetry/issues/10304) # -# This is necessary in order to rebuild or fetch new wheels. For example, if you update -# the `icu` library in on your system, you will need to rebuild the PyICU Python package -# in order to incorporate the correct dynamically linked library locations otherwise you -# will run into errors like: `ImportError: libicui18n.so.75: cannot open shared object file: No such file or directory` +# This is necessary in order to rebuild or fetch new wheels. 
$ rm -rf $(poetry config cache-dir) ``` diff --git a/docs/setup/installation.md b/docs/setup/installation.md index 0853496ab7d..05a557e77ac 100644 --- a/docs/setup/installation.md +++ b/docs/setup/installation.md @@ -286,7 +286,7 @@ Installing prerequisites on Ubuntu or Debian: ```sh sudo apt install build-essential python3-dev libffi-dev \ python3-pip python3-setuptools sqlite3 \ - libssl-dev virtualenv libjpeg-dev libxslt1-dev libicu-dev + libssl-dev virtualenv libjpeg-dev libxslt1-dev ``` ##### ArchLinux @@ -295,7 +295,7 @@ Installing prerequisites on ArchLinux: ```sh sudo pacman -S base-devel python python-pip \ - python-setuptools python-virtualenv sqlite3 icu + python-setuptools python-virtualenv sqlite3 ``` ##### CentOS/Fedora @@ -305,8 +305,7 @@ Installing prerequisites on CentOS or Fedora Linux: ```sh sudo dnf install libtiff-devel libjpeg-devel libzip-devel freetype-devel \ libwebp-devel libxml2-devel libxslt-devel libpq-devel \ - python3-virtualenv libffi-devel openssl-devel python3-devel \ - libicu-devel + python3-virtualenv libffi-devel openssl-devel python3-devel sudo dnf group install "Development Tools" ``` @@ -333,7 +332,7 @@ dnf install python3.12 python3.12-devel ``` Finally, install common prerequisites ```bash -dnf install libicu libicu-devel libpq5 libpq5-devel lz4 pkgconf +dnf install libpq5 libpq5-devel lz4 pkgconf dnf group install "Development Tools" ``` ###### Using venv module instead of virtualenv command @@ -365,19 +364,7 @@ xcode-select --install Some extra dependencies may be needed. You can use Homebrew (https://brew.sh) for them. -You may need to install icu, and make the icu binaries and libraries accessible. -Please follow [the official instructions of PyICU](https://pypi.org/project/PyICU/) to do so. - -If you're struggling to get icu discovered, and see: -``` - RuntimeError: - Please install pkg-config on your system or set the ICU_VERSION environment - variable to the version of ICU you have installed. 
-``` -despite it being installed and having your `PATH` updated, you can omit this dependency by -not specifying `--extras all` to `poetry`. If using postgres, you can install Synapse via -`poetry install --extras saml2 --extras oidc --extras postgres --extras opentracing --extras redis --extras sentry`. -ICU is not a hard dependency on getting a working installation. +The ICU library is bundled with Synapse and requires no additional setup. On ARM-based Macs you may also need to install libjpeg and libpq: ```sh @@ -400,8 +387,7 @@ Installing prerequisites on openSUSE: ```sh sudo zypper in -t pattern devel_basis sudo zypper in python-pip python-setuptools sqlite3 python-virtualenv \ - python-devel libffi-devel libopenssl-devel libjpeg62-devel \ - libicu-devel + python-devel libffi-devel libopenssl-devel libjpeg62-devel ``` ##### OpenBSD diff --git a/docs/upgrade.md b/docs/upgrade.md index d508e2231e3..1d07d00e98d 100644 --- a/docs/upgrade.md +++ b/docs/upgrade.md @@ -117,6 +117,13 @@ each upgrade are complete before moving on to the next upgrade, to avoid stacking them up. You can monitor the currently running background updates with [the Admin API](usage/administration/admin_api/background_updates.html#status). +# Upgrading to v1.131.0 + +## ICU bundled with Synapse + +Synapse now uses the Rust `icu` library for improved user search. Installing the +native ICU library on your system is no longer required. + # Upgrading to v1.130.0 ## Documented endpoint which can be delegated to a federation worker @@ -516,11 +523,11 @@ For all other installation methods, no acction is required. This version introduces optional support for an [improved user search dealing with Unicode characters](https://github.com/matrix-org/synapse/pull/14464). If you want to take advantage of this feature you need to install PyICU, -the ICU native dependency and its development headers -so that PyICU can build since no prebuilt wheels are available. 
+the ICU native dependency and its development headers so that PyICU can build +since no prebuilt wheels are available. -You can follow [the PyICU documentation](https://pypi.org/project/PyICU/) to do so, -and then do `pip install matrix-synapse[user-search]` for a PyPI install. +You can follow [the PyICU documentation](https://pypi.org/project/PyICU/) to do +so, and then do `pip install matrix-synapse[user-search]` for a PyPI install. Docker images and Debian packages need nothing specific as they already include or specify ICU as an explicit dependency. diff --git a/poetry.lock b/poetry.lock index cbed01b5646..75696f8ce48 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand. [[package]] name = "annotated-types" @@ -39,7 +39,7 @@ description = "The ultimate Python library in building OAuth and OpenID Connect optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"all\" or extra == \"jwt\" or extra == \"oidc\"" +markers = "extra == \"oidc\" or extra == \"jwt\" or extra == \"all\"" files = [ {file = "authlib-1.5.2-py2.py3-none-any.whl", hash = "sha256:8804dd4402ac5e4a0435ac49e0b6e19e395357cfa632a3f624dcb4f6df13b4b1"}, {file = "authlib-1.5.2.tar.gz", hash = "sha256:fe85ec7e50c5f86f1e2603518bb3b4f632985eb4a355e52256530790e326c512"}, @@ -451,7 +451,7 @@ description = "XML bomb protection for Python stdlib modules" optional = true python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" groups = ["main"] -markers = "extra == \"all\" or extra == \"saml2\"" +markers = "extra == \"saml2\" or extra == \"all\"" files = [ {file = "defusedxml-0.7.1-py2.py3-none-any.whl", hash = "sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61"}, {file = "defusedxml-0.7.1.tar.gz", hash = 
"sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69"}, @@ -494,7 +494,7 @@ description = "XPath 1.0/2.0/3.0/3.1 parsers and selectors for ElementTree and l optional = true python-versions = ">=3.7" groups = ["main"] -markers = "extra == \"all\" or extra == \"saml2\"" +markers = "extra == \"saml2\" or extra == \"all\"" files = [ {file = "elementpath-4.1.5-py3-none-any.whl", hash = "sha256:2ac1a2fb31eb22bbbf817f8cf6752f844513216263f0e3892c8e79782fe4bb55"}, {file = "elementpath-4.1.5.tar.gz", hash = "sha256:c2d6dc524b29ef751ecfc416b0627668119d8812441c555d7471da41d4bacb8d"}, @@ -544,7 +544,7 @@ description = "Python wrapper for hiredis" optional = true python-versions = ">=3.8" groups = ["main"] -markers = "extra == \"all\" or extra == \"redis\"" +markers = "extra == \"redis\" or extra == \"all\"" files = [ {file = "hiredis-3.1.0-cp310-cp310-macosx_10_15_universal2.whl", hash = "sha256:2892db9db21f0cf7cc298d09f85d3e1f6dc4c4c24463ab67f79bc7a006d51867"}, {file = "hiredis-3.1.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:93cfa6cc25ee2ceb0be81dc61eca9995160b9e16bdb7cca4a00607d57e998918"}, @@ -890,7 +890,7 @@ description = "Jaeger Python OpenTracing Tracer implementation" optional = true python-versions = ">=3.7" groups = ["main"] -markers = "extra == \"all\" or extra == \"opentracing\"" +markers = "extra == \"opentracing\" or extra == \"all\"" files = [ {file = "jaeger-client-4.8.0.tar.gz", hash = "sha256:3157836edab8e2c209bd2d6ae61113db36f7ee399e66b1dcbb715d87ab49bfe0"}, ] @@ -1028,7 +1028,7 @@ description = "A strictly RFC 4510 conforming LDAP V3 pure Python client library optional = true python-versions = "*" groups = ["main"] -markers = "extra == \"all\" or extra == \"matrix-synapse-ldap3\"" +markers = "extra == \"matrix-synapse-ldap3\" or extra == \"all\"" files = [ {file = "ldap3-2.9.1-py2.py3-none-any.whl", hash = "sha256:5869596fc4948797020d3f03b7939da938778a0f9e2009f7a072ccf92b8e8d70"}, {file = "ldap3-2.9.1.tar.gz", hash = 
"sha256:f3e7fc4718e3f09dda568b57100095e0ce58633bcabbed8667ce3f8fbaa4229f"}, @@ -1044,7 +1044,7 @@ description = "Powerful and Pythonic XML processing library combining libxml2/li optional = true python-versions = ">=3.6" groups = ["main"] -markers = "extra == \"all\" or extra == \"url-preview\"" +markers = "extra == \"url-preview\" or extra == \"all\"" files = [ {file = "lxml-5.4.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:e7bc6df34d42322c5289e37e9971d6ed114e3776b45fa879f734bded9d1fea9c"}, {file = "lxml-5.4.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6854f8bd8a1536f8a1d9a3655e6354faa6406621cf857dc27b681b69860645c7"}, @@ -1324,7 +1324,7 @@ description = "An LDAP3 auth provider for Synapse" optional = true python-versions = ">=3.7" groups = ["main"] -markers = "extra == \"all\" or extra == \"matrix-synapse-ldap3\"" +markers = "extra == \"matrix-synapse-ldap3\" or extra == \"all\"" files = [ {file = "matrix-synapse-ldap3-0.3.0.tar.gz", hash = "sha256:8bb6517173164d4b9cc44f49de411d8cebdb2e705d5dd1ea1f38733c4a009e1d"}, {file = "matrix_synapse_ldap3-0.3.0-py3-none-any.whl", hash = "sha256:8b4d701f8702551e98cc1d8c20dbed532de5613584c08d0df22de376ba99159d"}, @@ -1545,7 +1545,7 @@ description = "OpenTracing API for Python. 
See documentation at http://opentraci optional = true python-versions = "*" groups = ["main"] -markers = "extra == \"all\" or extra == \"opentracing\"" +markers = "extra == \"opentracing\" or extra == \"all\"" files = [ {file = "opentracing-2.4.0.tar.gz", hash = "sha256:a173117e6ef580d55874734d1fa7ecb6f3655160b8b8974a2a1e98e5ec9c840d"}, ] @@ -1714,7 +1714,7 @@ description = "psycopg2 - Python-PostgreSQL Database Adapter" optional = true python-versions = ">=3.8" groups = ["main"] -markers = "extra == \"all\" or extra == \"postgres\"" +markers = "extra == \"postgres\" or extra == \"all\"" files = [ {file = "psycopg2-2.9.10-cp310-cp310-win32.whl", hash = "sha256:5df2b672140f95adb453af93a7d669d7a7bf0a56bcd26f1502329166f4a61716"}, {file = "psycopg2-2.9.10-cp310-cp310-win_amd64.whl", hash = "sha256:c6f7b8561225f9e711a9c47087388a97fdc948211c10a4bccbf0ba68ab7b3b5a"}, @@ -1735,7 +1735,7 @@ description = ".. image:: https://travis-ci.org/chtd/psycopg2cffi.svg?branch=mas optional = true python-versions = "*" groups = ["main"] -markers = "platform_python_implementation == \"PyPy\" and (extra == \"all\" or extra == \"postgres\")" +markers = "platform_python_implementation == \"PyPy\" and (extra == \"postgres\" or extra == \"all\")" files = [ {file = "psycopg2cffi-2.9.0.tar.gz", hash = "sha256:7e272edcd837de3a1d12b62185eb85c45a19feda9e62fa1b120c54f9e8d35c52"}, ] @@ -1751,7 +1751,7 @@ description = "A Simple library to enable psycopg2 compatability" optional = true python-versions = "*" groups = ["main"] -markers = "platform_python_implementation == \"PyPy\" and (extra == \"all\" or extra == \"postgres\")" +markers = "platform_python_implementation == \"PyPy\" and (extra == \"postgres\" or extra == \"all\")" files = [ {file = "psycopg2cffi-compat-1.1.tar.gz", hash = "sha256:d25e921748475522b33d13420aad5c2831c743227dc1f1f2585e0fdb5c914e05"}, ] @@ -1967,18 +1967,6 @@ files = [ [package.extras] plugins = ["importlib-metadata ; python_version < \"3.8\""] -[[package]] -name = "pyicu" 
-version = "2.14" -description = "Python extension wrapping the ICU C++ API" -optional = true -python-versions = "*" -groups = ["main"] -markers = "extra == \"all\" or extra == \"user-search\"" -files = [ - {file = "PyICU-2.14.tar.gz", hash = "sha256:acc7eb92bd5c554ed577249c6978450a4feda0aa6f01470152b3a7b382a02132"}, -] - [[package]] name = "pyjwt" version = "2.6.0" @@ -2023,7 +2011,7 @@ description = "A development tool to measure, monitor and analyze the memory beh optional = true python-versions = ">=3.6" groups = ["main"] -markers = "extra == \"all\" or extra == \"cache-memory\"" +markers = "extra == \"cache-memory\" or extra == \"all\"" files = [ {file = "Pympler-1.0.1-py3-none-any.whl", hash = "sha256:d260dda9ae781e1eab6ea15bacb84015849833ba5555f141d2d9b7b7473b307d"}, {file = "Pympler-1.0.1.tar.gz", hash = "sha256:993f1a3599ca3f4fcd7160c7545ad06310c9e12f70174ae7ae8d4e25f6c5d3fa"}, @@ -2083,7 +2071,7 @@ description = "Python implementation of SAML Version 2 Standard" optional = true python-versions = ">=3.9,<4.0" groups = ["main"] -markers = "extra == \"all\" or extra == \"saml2\"" +markers = "extra == \"saml2\" or extra == \"all\"" files = [ {file = "pysaml2-7.5.0-py3-none-any.whl", hash = "sha256:bc6627cc344476a83c757f440a73fda1369f13b6fda1b4e16bca63ffbabb5318"}, {file = "pysaml2-7.5.0.tar.gz", hash = "sha256:f36871d4e5ee857c6b85532e942550d2cf90ea4ee943d75eb681044bbc4f54f7"}, @@ -2108,7 +2096,7 @@ description = "Extensions to the standard Python datetime module" optional = true python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" groups = ["main"] -markers = "extra == \"all\" or extra == \"saml2\"" +markers = "extra == \"saml2\" or extra == \"all\"" files = [ {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, @@ -2136,7 +2124,7 @@ description = "World 
timezone definitions, modern and historical" optional = true python-versions = "*" groups = ["main"] -markers = "extra == \"all\" or extra == \"saml2\"" +markers = "extra == \"saml2\" or extra == \"all\"" files = [ {file = "pytz-2022.7.1-py2.py3-none-any.whl", hash = "sha256:78f4f37d8198e0627c5f1143240bb0206b8691d8d7ac6d78fee88b78733f8c4a"}, {file = "pytz-2022.7.1.tar.gz", hash = "sha256:01a0681c4b9684a28304615eba55d1ab31ae00bf68ec157ec3708a8182dbbcd0"}, @@ -2500,7 +2488,7 @@ description = "Python client for Sentry (https://sentry.io)" optional = true python-versions = ">=3.6" groups = ["main"] -markers = "extra == \"all\" or extra == \"sentry\"" +markers = "extra == \"sentry\" or extra == \"all\"" files = [ {file = "sentry_sdk-2.22.0-py2.py3-none-any.whl", hash = "sha256:3d791d631a6c97aad4da7074081a57073126c69487560c6f8bffcf586461de66"}, {file = "sentry_sdk-2.22.0.tar.gz", hash = "sha256:b4bf43bb38f547c84b2eadcefbe389b36ef75f3f38253d7a74d6b928c07ae944"}, @@ -2688,7 +2676,7 @@ description = "Tornado IOLoop Backed Concurrent Futures" optional = true python-versions = "*" groups = ["main"] -markers = "extra == \"all\" or extra == \"opentracing\"" +markers = "extra == \"opentracing\" or extra == \"all\"" files = [ {file = "threadloop-1.0.2-py2-none-any.whl", hash = "sha256:5c90dbefab6ffbdba26afb4829d2a9df8275d13ac7dc58dccb0e279992679599"}, {file = "threadloop-1.0.2.tar.gz", hash = "sha256:8b180aac31013de13c2ad5c834819771992d350267bddb854613ae77ef571944"}, @@ -2704,7 +2692,7 @@ description = "Python bindings for the Apache Thrift RPC system" optional = true python-versions = "*" groups = ["main"] -markers = "extra == \"all\" or extra == \"opentracing\"" +markers = "extra == \"opentracing\" or extra == \"all\"" files = [ {file = "thrift-0.16.0.tar.gz", hash = "sha256:2b5b6488fcded21f9d312aa23c9ff6a0195d0f6ae26ddbd5ad9e3e25dfc14408"}, ] @@ -2766,7 +2754,7 @@ description = "Tornado is a Python web framework and asynchronous networking lib optional = true python-versions = 
">=3.9" groups = ["main"] -markers = "extra == \"all\" or extra == \"opentracing\"" +markers = "extra == \"opentracing\" or extra == \"all\"" files = [ {file = "tornado-6.5-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:f81067dad2e4443b015368b24e802d0083fecada4f0a4572fdb72fc06e54a9a6"}, {file = "tornado-6.5-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:9ac1cbe1db860b3cbb251e795c701c41d343f06a96049d6274e7c77559117e41"}, @@ -2901,7 +2889,7 @@ description = "non-blocking redis client for python" optional = true python-versions = "*" groups = ["main"] -markers = "extra == \"all\" or extra == \"redis\"" +markers = "extra == \"redis\" or extra == \"all\"" files = [ {file = "txredisapi-1.4.11-py3-none-any.whl", hash = "sha256:ac64d7a9342b58edca13ef267d4fa7637c1aa63f8595e066801c1e8b56b22d0b"}, {file = "txredisapi-1.4.11.tar.gz", hash = "sha256:3eb1af99aefdefb59eb877b1dd08861efad60915e30ad5bf3d5bf6c5cedcdbc6"}, @@ -3244,7 +3232,7 @@ description = "An XML Schema validator and decoder" optional = true python-versions = ">=3.7" groups = ["main"] -markers = "extra == \"all\" or extra == \"saml2\"" +markers = "extra == \"saml2\" or extra == \"all\"" files = [ {file = "xmlschema-2.4.0-py3-none-any.whl", hash = "sha256:dc87be0caaa61f42649899189aab2fd8e0d567f2cf548433ba7b79278d231a4a"}, {file = "xmlschema-2.4.0.tar.gz", hash = "sha256:d74cd0c10866ac609e1ef94a5a69b018ad16e39077bc6393408b40c6babee793"}, @@ -3371,7 +3359,7 @@ docs = ["Sphinx", "repoze.sphinx.autointerface"] test = ["zope.i18nmessageid", "zope.testing", "zope.testrunner"] [extras] -all = ["Pympler", "authlib", "hiredis", "jaeger-client", "lxml", "matrix-synapse-ldap3", "opentracing", "psycopg2", "psycopg2cffi", "psycopg2cffi-compat", "pyicu", "pysaml2", "sentry-sdk", "txredisapi"] +all = ["Pympler", "authlib", "hiredis", "jaeger-client", "lxml", "matrix-synapse-ldap3", "opentracing", "psycopg2", "psycopg2cffi", "psycopg2cffi-compat", "pysaml2", "sentry-sdk", "txredisapi"] cache-memory = ["Pympler"] jwt = 
["authlib"] matrix-synapse-ldap3 = ["matrix-synapse-ldap3"] @@ -3384,9 +3372,8 @@ sentry = ["sentry-sdk"] systemd = ["systemd-python"] test = ["idna", "parameterized"] url-preview = ["lxml"] -user-search = ["pyicu"] [metadata] lock-version = "2.1" python-versions = "^3.9.0" -content-hash = "9824e42dfc0e128129ee0c8641f7fe639bf47574cdd3f052dd995941abc6e44b" +content-hash = "457f188ae22af9663b2ed21f2586720ce5014edc7c34a697787f16aad733ea41" diff --git a/pyproject.toml b/pyproject.toml index d95881b53a3..e00fd151163 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -250,7 +250,6 @@ hiredis = { version = "*", optional = true } Pympler = { version = "*", optional = true } parameterized = { version = ">=0.7.4", optional = true } idna = { version = ">=2.5", optional = true } -pyicu = { version = ">=2.10.2", optional = true } [tool.poetry.extras] # NB: Packages that should be part of `pip install matrix-synapse[all]` need to be specified @@ -273,10 +272,6 @@ redis = ["txredisapi", "hiredis"] # Required to use experimental `caches.track_memory_usage` config option. cache-memory = ["pympler"] test = ["parameterized", "idna"] -# Allows for better search for international characters in the user directory. This -# requires libicu's development headers installed on the system (e.g. libicu-dev on -# Debian-based distributions). -user-search = ["pyicu"] # The duplication here is awful. I hate hate hate hate hate it. However, for now I want # to ensure you can still `pip install matrix-synapse[all]` like today. 
Two motivations: @@ -308,8 +303,6 @@ all = [ "txredisapi", "hiredis", # cache-memory "pympler", - # improved user search - "pyicu", # omitted: # - test: it's useful to have this separate from dev deps in the olddeps job # - systemd: this is a system-based requirement diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 840988e74eb..3afc4f47c00 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -43,6 +43,7 @@ sha2 = "0.10.8" serde = { version = "1.0.144", features = ["derive"] } serde_json = "1.0.85" ulid = "1.1.2" +icu_segmenter = { version = "2.0", features = ["compiled_data"] } [features] extension-module = ["pyo3/extension-module"] diff --git a/rust/src/lib.rs b/rust/src/lib.rs index d751889874b..b686c5574d1 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -12,6 +12,7 @@ pub mod identifier; pub mod matrix_const; pub mod push; pub mod rendezvous; +pub mod segmenter; lazy_static! { static ref LOGGING_HANDLE: ResetHandle = pyo3_log::init(); @@ -51,6 +52,7 @@ fn synapse_rust(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> { push::register_module(py, m)?; events::register_module(py, m)?; rendezvous::register_module(py, m)?; + segmenter::register_module(py, m)?; Ok(()) } diff --git a/rust/src/segmenter.rs b/rust/src/segmenter.rs new file mode 100644 index 00000000000..038f37640c8 --- /dev/null +++ b/rust/src/segmenter.rs @@ -0,0 +1,30 @@ +use pyo3::prelude::*; +use icu_segmenter::WordSegmenter; +use icu_segmenter::options::WordBreakInvariantOptions; + +#[pyfunction] +pub fn parse_words(text: &str) -> PyResult<Vec<String>> { + let segmenter = WordSegmenter::new_auto(WordBreakInvariantOptions::default()); + let mut parts = Vec::new(); + let mut last = 0usize; + for boundary in segmenter.segment_str(text) { + if boundary > last { + parts.push(text[last..boundary].to_string()); + } + last = boundary; + } + Ok(parts) +} + +pub fn register_module(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> { + let child_module = PyModule::new(py, "segmenter")?; + 
child_module.add_function(wrap_pyfunction!(parse_words, m)?)?; + + m.add_submodule(&child_module)?; + + py.import("sys")? + .getattr("modules")? + .set_item("synapse.synapse_rust.segmenter", child_module)?; + + Ok(()) +} diff --git a/synapse/storage/databases/main/user_directory.py b/synapse/storage/databases/main/user_directory.py index 2b867cdb6ea..2c9427d7af0 100644 --- a/synapse/storage/databases/main/user_directory.py +++ b/synapse/storage/databases/main/user_directory.py @@ -37,15 +37,8 @@ import attr -try: - # Figure out if ICU support is available for searching users. - import icu - - USE_ICU = True -except ModuleNotFoundError: - USE_ICU = False - from synapse.api.errors import StoreError +from synapse.synapse_rust import segmenter as icu from synapse.util.stringutils import non_null_str_or_none if TYPE_CHECKING: @@ -1270,12 +1263,7 @@ def _parse_query_postgres(search_term: str) -> Tuple[str, str, str]: def _parse_words(search_term: str) -> List[str]: - """Split the provided search string into a list of its words. - - If support for ICU (International Components for Unicode) is available, use it. - Otherwise, fall back to using a regex to detect word boundaries. This latter - solution works well enough for most latin-based languages, but doesn't work as well - with other languages. + """Split the provided search string into a list of its words using ICU. Args: search_term: The search string. @@ -1283,10 +1271,7 @@ def _parse_words(search_term: str) -> List[str]: Returns: A list of the words in the search string. """ - if USE_ICU: - return _parse_words_with_icu(search_term) - - return _parse_words_with_regex(search_term) + return _parse_words_with_icu(search_term) def _parse_words_with_regex(search_term: str) -> List[str]: @@ -1308,21 +1293,10 @@ def _parse_words_with_icu(search_term: str) -> List[str]: A list of the words in the search string. 
""" results = [] - breaker = icu.BreakIterator.createWordInstance(icu.Locale.getDefault()) - breaker.setText(search_term) - i = 0 - while True: - j = breaker.nextBoundary() - if j < 0: - break - - # We want to make sure that we split on `@` and `:` specifically, as - # they occur in user IDs. - for result in re.split(r"[@:]+", search_term[i:j]): + for part in icu.parse_words(search_term): + for result in re.split(r"[@:]+", part): results.append(result.strip()) - i = j - # libicu will break up words that have punctuation in them, but to handle # cases where user IDs have '-', '.' and '_' in them we want to *not* break # those into words and instead allow the DB to tokenise them how it wants. diff --git a/synapse/synapse_rust/segmenter.pyi b/synapse/synapse_rust/segmenter.pyi new file mode 100644 index 00000000000..5f367659479 --- /dev/null +++ b/synapse/synapse_rust/segmenter.pyi @@ -0,0 +1,3 @@ +from typing import List + +def parse_words(text: str) -> List[str]: ... diff --git a/tests/storage/test_user_directory.py b/tests/storage/test_user_directory.py index c26932069f8..781832b3fc4 100644 --- a/tests/storage/test_user_directory.py +++ b/tests/storage/test_user_directory.py @@ -44,12 +44,6 @@ from tests.test_utils.event_injection import inject_member_event from tests.unittest import HomeserverTestCase, override_config -try: - import icu -except ImportError: - icu = None # type: ignore - - ALICE = "@alice:a" BOB = "@bob:b" BOBBY = "@bobby:a" @@ -451,11 +445,12 @@ def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None: self.get_success(self.store.update_profile_in_user_dir(BELA, "Bela", None)) self.get_success(self.store.add_users_in_public_rooms("!room:id", (ALICE, BOB))) - self._restore_use_icu = user_directory.USE_ICU - user_directory.USE_ICU = self.use_icu + self._restore_parse_words = user_directory._parse_words + if not self.use_icu: + user_directory._parse_words = user_directory._parse_words_with_regex def tearDown(self) -> None: - 
user_directory.USE_ICU = self._restore_use_icu + user_directory._parse_words = self._restore_parse_words def test_search_user_dir(self) -> None: # normally when alice searches the directory she should just find @@ -651,14 +646,8 @@ def test_search_user_dir_accent_insensitivity(self) -> None: class UserDirectoryStoreTestCaseWithIcu(UserDirectoryStoreTestCase): use_icu = True - if not icu: - skip = "Requires PyICU" - class UserDirectoryICUTestCase(HomeserverTestCase): - if not icu: - skip = "Requires PyICU" - def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None: self.store = hs.get_datastores().main self.user_dir_helper = GetUserDirectoryTables(self.store)