From c290d828f6260fba69270e80ec1b9a15c5d1fda6 Mon Sep 17 00:00:00 2001 From: Hajime Senuma Date: Fri, 24 Jan 2025 00:06:39 +0900 Subject: [PATCH] Improve hash128, hash64, and hash_bytes (#116) --- CHANGELOG.md | 8 ++ README.md | 26 ++---- docs/api.md | 2 +- pyproject.toml | 2 +- src/mmh3/mmh3module.c | 195 +++++++++++++++++++++++++++++++++++++----- 5 files changed, 192 insertions(+), 41 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 17c659b..9509cb2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,13 @@ This project has adhered to [Semantic Versioning 2.0.0](https://semver.org/spec/v2.0.0.html) since version 3.0.0. +## [Unreleased] + +### Added + +- Improve the performance of `hash128()`, `hash64()`, and `hash_bytes()` + by using METH_FASTCALL, reducing the overhead of function calls. + ## [5.0.1] - 2024-09-22 ### Fixed @@ -267,6 +274,7 @@ only. [Softpedia collected mmh3 1.0 on April 27, 2011](https://web.archive.org/web/20110430172027/https://linux.softpedia.com/get/Programming/Libraries/mmh3-68314.shtml), it must have been uploaded to PyPI on or slightly before this date. 
+[Unreleased]: https://github.com/hajimes/mmh3/compare/v5.0.1...HEAD [5.0.1]: https://github.com/hajimes/mmh3/compare/v5.0.0...v5.0.1 [5.0.0]: https://github.com/hajimes/mmh3/compare/v4.1.0...v5.0.0 [4.1.0]: https://github.com/hajimes/mmh3/compare/v4.0.1...v4.1.0 diff --git a/README.md b/README.md index c7ff4e4..474fecc 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # mmh3 -[![Documentation Status](https://readthedocs.org/projects/mmh3/badge/?version=latest)](https://mmh3.readthedocs.io/en/latest/?badge=latest) +[![Documentation Status](https://readthedocs.org/projects/mmh3/badge/?version=stable)](https://mmh3.readthedocs.io/en/latest/?badge=stable) [![GitHub Super-Linter](https://github.com/hajimes/mmh3/actions/workflows/superlinter.yml/badge.svg?branch=master)](https://github.com/hajimes/mmh3/actions?query=workflow%3ASuper-Linter+branch%3Amaster) [![Build](https://github.com/hajimes/mmh3/actions/workflows/build.yml/badge.svg?branch=master)](https://github.com/hajimes/mmh3/actions/workflows/build.yml?branch=master) [![PyPi Version](https://img.shields.io/pypi/v/mmh3.svg?style=flat-square&logo=pypi&logoColor=white)](https://pypi.org/project/mmh3/) @@ -80,6 +80,13 @@ in the API Reference for more information. See [Changelog](https://mmh3.readthedocs.io/en/latest/changelog.html) for the complete changelog. +### [Unreleased] + +#### Added + +- Improve the performance of `hash128()`, `hash64()`, and `hash_bytes()` + by using METH_FASTCALL, reducing the overhead of function calls. + ### [5.0.1] - 2024-09-22 #### Fixed @@ -138,21 +145,6 @@ complete changelog. [#77](https://github.com/hajimes/mmh3/pull/77), [#84](https://github.com/hajimes/mmh3/pull/84)). -### [4.1.0] - 2024-01-09 - -#### Added - -- Add support for Python 3.12. - -#### Fixed - -- Fix issues with Bazel by changing the directory structure of the project - ([#50](https://github.com/hajimes/mmh3/issues/50)). -- Fix incorrect type hints ([#51](https://github.com/hajimes/mmh3/issues/51)). 
-- Fix invalid results on s390x when the arg `x64arch` of `hash64` or - `hash_bytes()` is set to `False` - ([#52](https://github.com/hajimes/mmh3/issues/52)). - ## License [MIT](https://github.com/hajimes/mmh3/blob/master/LICENSE), unless otherwise @@ -248,6 +240,6 @@ is useful for OSINT and cybersecurity activities. - : Python bindings for xxHash (Yue Du) +[Unreleased]: https://github.com/hajimes/mmh3/compare/v5.0.1...HEAD [5.0.1]: https://github.com/hajimes/mmh3/compare/v5.0.0...v5.0.1 [5.0.0]: https://github.com/hajimes/mmh3/compare/v4.1.0...v5.0.0 -[4.1.0]: https://github.com/hajimes/mmh3/compare/v4.0.1...v4.1.0 diff --git a/docs/api.md b/docs/api.md index 7e9dcf8..f07065d 100644 --- a/docs/api.md +++ b/docs/api.md @@ -40,7 +40,7 @@ UTF-8 encoding before hashing. Although `hash128()`, `hash64()`, and `mmh3.hash_bytes()` are provided for compatibility with previous versions and are not marked for deprecation, -the [buffer-accepting hashe functions](#buffer-accepting-hash-functions) +the [buffer-accepting hash functions](#buffer-accepting-hash-functions) introduced in version 5.0.0 are recommended for new code. ```{eval-rst} diff --git a/pyproject.toml b/pyproject.toml index c50608a..529fa04 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta" [project] name = "mmh3" -version = "5.0.2-dev.1" +version = "5.1.0-dev.1" description = "Python extension for MurmurHash (MurmurHash3), a set of fast and robust hash functions." readme = "README.md" license = {file = "LICENSE"} diff --git a/src/mmh3/mmh3module.c b/src/mmh3/mmh3module.c index 3efc6ea..d316963 100644 --- a/src/mmh3/mmh3module.c +++ b/src/mmh3/mmh3module.c @@ -358,13 +358,17 @@ PyDoc_STRVAR( " tuple[int, int]: The hash value as a tuple of two 64-bit " "integers.\n" "\n" + ".. versionchanged:: 5.1.0\n" + " Performance improvements.\n" + "\n" ".. 
versionchanged:: 5.0.0\n" " The ``seed`` argument is now strictly checked for valid range.\n" " The type of the ``x64arch`` and ``signed`` arguments has been\n" " changed from ``bool`` to ``Any``.\n"); static PyObject * -mmh3_hash64(PyObject *self, PyObject *args, PyObject *keywds) +mmh3_hash64(PyObject *self, PyObject *const *args, Py_ssize_t nargs, + PyObject *kwnames) { const char *target_str; Py_ssize_t target_str_len; @@ -373,17 +377,66 @@ mmh3_hash64(PyObject *self, PyObject *args, PyObject *keywds) int x64arch = 1; int is_signed = 1; - static char *kwlist[] = {"key", "seed", "x64arch", "signed", NULL}; - static char *valflag[] = {"KK", "LL"}; - if (!PyArg_ParseTupleAndKeywords(args, keywds, "s#|Lpp", kwlist, - &target_str, &target_str_len, &seed, - &x64arch, &is_signed)) { + if ((nargs < 1) && kwnames == NULL) { + PyErr_SetString(PyExc_TypeError, + "function missing required argument 'key' (pos 1)"); return NULL; } - MMH3_VALIDATE_SEED_RETURN_NULL(seed); + if (nargs > 4) { + PyErr_Format(PyExc_TypeError, + "function takes at most 4 arguments (%d given)", + (int)nargs); + return NULL; + } + + if (nargs >= 1) { + MMH3_HASH_VALIDATE_AND_SET_BYTES(args[0], target_str, target_str_len); + } + + if (nargs >= 2) { + MMH3_HASH_VALIDATE_AND_SET_SEED(args[1], seed); + } + + if (nargs >= 3) { + x64arch = PyObject_IsTrue(args[2]); + } + + if (nargs >= 4) { + is_signed = PyObject_IsTrue(args[3]); + } + + if (kwnames) { + for (Py_ssize_t i = 0; i < PyTuple_Size(kwnames); i++) { + const char *kwname = PyUnicode_AsUTF8(PyTuple_GetItem(kwnames, i)); + if (strcmp(kwname, "key") == 0) { + MMH3_HASH_VALIDATE_ARG_DUPLICATION(nargs, "key", 1); + MMH3_HASH_VALIDATE_AND_SET_BYTES(args[nargs + i], target_str, + target_str_len); + } + else if (strcmp(kwname, "seed") == 0) { + MMH3_HASH_VALIDATE_ARG_DUPLICATION(nargs, "seed", 2); + MMH3_HASH_VALIDATE_AND_SET_SEED(args[nargs + i], seed); + } + else if (strcmp(kwname, "x64arch") == 0) { + MMH3_HASH_VALIDATE_ARG_DUPLICATION(nargs, 
"x64arch", 3); + x64arch = PyObject_IsTrue(args[nargs + i]); + } + else if (strcmp(kwname, "signed") == 0) { + MMH3_HASH_VALIDATE_ARG_DUPLICATION(nargs, "signed", 4); + is_signed = PyObject_IsTrue(args[nargs + i]); + } + else { + PyErr_Format( + PyExc_TypeError, + "'%s' is an invalid keyword argument for this function", + kwname); + return NULL; + } + } + } if (x64arch == 1) { murmurhash3_x64_128(target_str, target_str_len, (uint32_t)seed, @@ -418,13 +471,17 @@ PyDoc_STRVAR( "Returns:\n" " int: The hash value as a 128-bit integer.\n" "\n" + ".. versionchanged:: 5.1.0\n" + " Performance improvements.\n" + "\n" ".. versionchanged:: 5.0.0\n" " The ``seed`` argument is now strictly checked for valid range.\n" " The type of the ``x64arch`` and ``signed`` arguments has been\n" " changed from ``bool`` to ``Any``.\n"); static PyObject * -mmh3_hash128(PyObject *self, PyObject *args, PyObject *keywds) +mmh3_hash128(PyObject *self, PyObject *const *args, Py_ssize_t nargs, + PyObject *kwnames) { const char *target_str; Py_ssize_t target_str_len; @@ -433,15 +490,64 @@ mmh3_hash128(PyObject *self, PyObject *args, PyObject *keywds) int x64arch = 1; int is_signed = 0; - static char *kwlist[] = {"key", "seed", "x64arch", "signed", NULL}; + if ((nargs < 1) && kwnames == NULL) { + PyErr_SetString(PyExc_TypeError, + "function missing required argument 'key' (pos 1)"); + return NULL; + } - if (!PyArg_ParseTupleAndKeywords(args, keywds, "s#|Lpp", kwlist, - &target_str, &target_str_len, &seed, - &x64arch, &is_signed)) { + if (nargs > 4) { + PyErr_Format(PyExc_TypeError, + "function takes at most 4 arguments (%d given)", + (int)nargs); return NULL; } - MMH3_VALIDATE_SEED_RETURN_NULL(seed); + if (nargs >= 1) { + MMH3_HASH_VALIDATE_AND_SET_BYTES(args[0], target_str, target_str_len); + } + + if (nargs >= 2) { + MMH3_HASH_VALIDATE_AND_SET_SEED(args[1], seed); + } + + if (nargs >= 3) { + x64arch = PyObject_IsTrue(args[2]); + } + + if (nargs >= 4) { + is_signed = PyObject_IsTrue(args[3]); + } + 
+ if (kwnames) { + for (Py_ssize_t i = 0; i < PyTuple_Size(kwnames); i++) { + const char *kwname = PyUnicode_AsUTF8(PyTuple_GetItem(kwnames, i)); + if (strcmp(kwname, "key") == 0) { + MMH3_HASH_VALIDATE_ARG_DUPLICATION(nargs, "key", 1); + MMH3_HASH_VALIDATE_AND_SET_BYTES(args[nargs + i], target_str, + target_str_len); + } + else if (strcmp(kwname, "seed") == 0) { + MMH3_HASH_VALIDATE_ARG_DUPLICATION(nargs, "seed", 2); + MMH3_HASH_VALIDATE_AND_SET_SEED(args[nargs + i], seed); + } + else if (strcmp(kwname, "x64arch") == 0) { + MMH3_HASH_VALIDATE_ARG_DUPLICATION(nargs, "x64arch", 3); + x64arch = PyObject_IsTrue(args[nargs + i]); + } + else if (strcmp(kwname, "signed") == 0) { + MMH3_HASH_VALIDATE_ARG_DUPLICATION(nargs, "signed", 4); + is_signed = PyObject_IsTrue(args[nargs + i]); + } + else { + PyErr_Format( + PyExc_TypeError, + "'%s' is an invalid keyword argument for this function", + kwname); + return NULL; + } + } + } if (x64arch == 1) { murmurhash3_x64_128(target_str, target_str_len, seed, result); @@ -484,13 +590,17 @@ PyDoc_STRVAR( " bytes: The hash value as the ``bytes`` type with a length of 16\n" " bytes (128 bits).\n") "\n" + ".. versionchanged:: 5.1.0\n" + " Performance improvements.\n" + "\n" ".. 
versionchanged:: 5.0.0\n" " The ``seed`` argument is now strictly checked for valid range.\n" " The type of the ``x64arch`` argument has been changed from\n" " ``bool`` to ``Any``.\n"; static PyObject * -mmh3_hash_bytes(PyObject *self, PyObject *args, PyObject *keywds) +mmh3_hash_bytes(PyObject *self, PyObject *const *args, Py_ssize_t nargs, + PyObject *kwnames) { const char *target_str; Py_ssize_t target_str_len; @@ -498,15 +608,56 @@ mmh3_hash_bytes(PyObject *self, PyObject *args, PyObject *keywds) uint64_t result[2]; int x64arch = 1; - static char *kwlist[] = {"key", "seed", "x64arch", NULL}; + if ((nargs < 1) && kwnames == NULL) { + PyErr_SetString(PyExc_TypeError, + "function missing required argument 'key' (pos 1)"); + return NULL; + } - if (!PyArg_ParseTupleAndKeywords(args, keywds, "s#|Lp", kwlist, - &target_str, &target_str_len, &seed, - &x64arch)) { + if (nargs > 3) { + PyErr_Format(PyExc_TypeError, + "function takes at most 3 arguments (%d given)", + (int)nargs); return NULL; } - MMH3_VALIDATE_SEED_RETURN_NULL(seed); + if (nargs >= 1) { + MMH3_HASH_VALIDATE_AND_SET_BYTES(args[0], target_str, target_str_len); + } + + if (nargs >= 2) { + MMH3_HASH_VALIDATE_AND_SET_SEED(args[1], seed); + } + + if (nargs >= 3) { + x64arch = PyObject_IsTrue(args[2]); + } + + if (kwnames) { + for (Py_ssize_t i = 0; i < PyTuple_Size(kwnames); i++) { + const char *kwname = PyUnicode_AsUTF8(PyTuple_GetItem(kwnames, i)); + if (strcmp(kwname, "key") == 0) { + MMH3_HASH_VALIDATE_ARG_DUPLICATION(nargs, "key", 1); + MMH3_HASH_VALIDATE_AND_SET_BYTES(args[nargs + i], target_str, + target_str_len); + } + else if (strcmp(kwname, "seed") == 0) { + MMH3_HASH_VALIDATE_ARG_DUPLICATION(nargs, "seed", 2); + MMH3_HASH_VALIDATE_AND_SET_SEED(args[nargs + i], seed); + } + else if (strcmp(kwname, "x64arch") == 0) { + MMH3_HASH_VALIDATE_ARG_DUPLICATION(nargs, "x64arch", 3); + x64arch = PyObject_IsTrue(args[nargs + i]); + } + else { + PyErr_Format( + PyExc_TypeError, + "'%s' is an invalid keyword 
argument for this function", + kwname); + return NULL; + } + } + } if (x64arch == 1) { murmurhash3_x64_128(target_str, target_str_len, seed, result); @@ -1085,11 +1236,11 @@ static PyMethodDef Mmh3Methods[] = { mmh3_hash_doc}, {"hash_from_buffer", (PyCFunction)mmh3_hash_from_buffer, METH_VARARGS | METH_KEYWORDS, mmh3_hash_from_buffer_doc}, - {"hash64", (PyCFunction)mmh3_hash64, METH_VARARGS | METH_KEYWORDS, + {"hash64", (PyCFunction)mmh3_hash64, METH_FASTCALL | METH_KEYWORDS, mmh3_hash64_doc}, - {"hash128", (PyCFunction)mmh3_hash128, METH_VARARGS | METH_KEYWORDS, + {"hash128", (PyCFunction)mmh3_hash128, METH_FASTCALL | METH_KEYWORDS, mmh3_hash128_doc}, - {"hash_bytes", (PyCFunction)mmh3_hash_bytes, METH_VARARGS | METH_KEYWORDS, + {"hash_bytes", (PyCFunction)mmh3_hash_bytes, METH_FASTCALL | METH_KEYWORDS, mmh3_hash_bytes_doc}, {"mmh3_32_digest", (PyCFunction)mmh3_mmh3_32_digest, METH_FASTCALL, mmh3_mmh3_32_digest_doc},