From a81cc94b76a908911fb5e18236abe183b26d5434 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89mile=20Royer?= Date: Thu, 14 Nov 2024 11:16:49 +0100 Subject: [PATCH 1/3] Make the mini-batch methods unavailable for TF-IDF There is currently no mini-batch implementation of TF-IDF. To prevent Python from using the methods from the parent class BagOfWords (which would give incorrect results), we add the methods to TF-IDF and raise an error. --- river/feature_extraction/vectorize.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/river/feature_extraction/vectorize.py b/river/feature_extraction/vectorize.py index 5f68450208..66d7454261 100644 --- a/river/feature_extraction/vectorize.py +++ b/river/feature_extraction/vectorize.py @@ -489,3 +489,12 @@ def transform_one(self, x): norm = math.sqrt(sum(tfidf**2 for tfidf in tfidfs.values())) return {term: tfidf / norm for term, tfidf in tfidfs.items()} return tfidfs + + # Mini-batch methods should be done well™ and not just be a loop over the *_one equivalent. + def learn_many(self, X): + "Not available, will raise an exception." + raise NotImplementedError + + def transform_many(self, X): + "Not available, will raise an exception." + raise NotImplementedError From 891cdd01f979a14ed8c3cf1be7883cf2dc34bf3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89mile=20Royer?= Date: Thu, 14 Nov 2024 11:19:59 +0100 Subject: [PATCH 2/3] Add missing parameters from VectorizerMixin The parameters were documented in the docstring but were not in the constructor. 
--- river/feature_extraction/vectorize.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/river/feature_extraction/vectorize.py b/river/feature_extraction/vectorize.py index 66d7454261..e0a9496730 100644 --- a/river/feature_extraction/vectorize.py +++ b/river/feature_extraction/vectorize.py @@ -451,6 +451,8 @@ def __init__( strip_accents=True, lowercase=True, preprocessor: typing.Callable | None = None, + stop_words: set[str] | None = None, + tokenizer_pattern=r"(?u)\b\w[\w\-]+\b", tokenizer: typing.Callable | None = None, ngram_range=(1, 1), ): @@ -459,6 +461,8 @@ def __init__( strip_accents=strip_accents, lowercase=lowercase, preprocessor=preprocessor, + stop_words=stop_words, + tokenizer_pattern=tokenizer_pattern, tokenizer=tokenizer, ngram_range=ngram_range, ) From 3acea43eceabb000d5c50a3c6602bcf0cda257ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89mile=20Royer?= Date: Thu, 14 Nov 2024 12:19:07 +0100 Subject: [PATCH 3/3] Changelog entry --- docs/releases/unreleased.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/releases/unreleased.md b/docs/releases/unreleased.md index 4ed6dd48f1..5ed9315b52 100644 --- a/docs/releases/unreleased.md +++ b/docs/releases/unreleased.md @@ -14,6 +14,10 @@ - Make `drift.ADWIN` comply with the reference MOA implementation. +## feature extraction + +- The mini-batch methods for `feature_extraction.TFIDF` now systematically raise an exception, as they are not implemented. + ## stats - Removed the unexported class `stats.CentralMoments`.