From 2832efafdc1d313e3572ec7c4acf66cb61598adf Mon Sep 17 00:00:00 2001
From: andefined
Date: Mon, 26 Apr 2021 09:29:56 +0300
Subject: [PATCH] added Detect method

---
 plagiarism.go      |  18 +++++
 plagiarism_test.go | 189 ++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 205 insertions(+), 2 deletions(-)

diff --git a/plagiarism.go b/plagiarism.go
index dc7f2a2..dd3917e 100644
--- a/plagiarism.go
+++ b/plagiarism.go
@@ -198,6 +198,24 @@ func (p *Detector) Equal(source, target []string) bool {
 	return true
 }
 
+// Detect runs the detection on the values already stored on the Detector
+// instead of taking them as arguments. When both SourceStopWords and
+// TargetStopWords are non-empty they are passed to DetectWithStopWords;
+// otherwise, when both SourceText and TargetText are non-empty, they are
+// passed to DetectWithStrings. It returns an error when neither pair is fully
+// set, or the error reported by the method it delegates to.
+func (p *Detector) Detect() error {
+	switch {
+	case len(p.SourceStopWords) > 0 && len(p.TargetStopWords) > 0:
+		// Stopword lists take precedence over the raw texts when both are set.
+		return p.DetectWithStopWords(p.SourceStopWords, p.TargetStopWords)
+	case p.SourceText != "" && p.TargetText != "":
+		return p.DetectWithStrings(p.SourceText, p.TargetText)
+	default:
+		return fmt.Errorf("you should at least define source and target texts")
+	}
+}
+
 // DetectWithStrings returns an error on failure, otherwise will invoke
 // DetectWithStopWords method.
 func (p *Detector) DetectWithStrings(source, target string) error {
diff --git a/plagiarism_test.go b/plagiarism_test.go
index e7d1345..4458e0b 100644
--- a/plagiarism_test.go
+++ b/plagiarism_test.go
@@ -6,10 +6,42 @@ import (
 	"testing"
 )
 
+// sourceString is the longer of the two sample texts shared by the tests below.
+var sourceString = `Plagiarism detection using stopwords n-grams. go-plagiarism is the main algorithm
+that utilizes MediaWatch and is inspired by Efstathios Stamatatos paper.
+We only rely on a list of stopwords to calculate
+the plagiarism probability between two texts, in combination with n-gram
+loops that let us find, not only plagiarism but also
+paraphrase and patchwork plagiarism. In our case (cc MediaWatch) we
+use this algorithm to create relationships between similar articles and
+map the process, or the chain of misinformation. As our
+scope is to track propaganda networks in the news ecosystem,
+this algorithm only tested in such context.`
+
+// sourceStopWords is the stopword list used for the source side in the tests.
+var sourceStopWords = []string{
+	"using", "is", "the", "that", "and", "is", "by", "we", "only", "on", "a", "of", "to", "the", "between", "two", "in", "with", "that", "let", "us", "not", "only", "but", "also", "and", "in", "our", "case", "we", "use", "this", "to", "between", "similar", "and", "the", "or", "the", "of", "as", "our", "is", "to", "in", "the", "this", "only", "in", "such",
+}
+
+// targetString is sourceString without its first two sentences.
+var targetString = `We only rely on a list of stopwords to calculate
+the plagiarism probability between two texts, in combination with n-gram
+loops that let us find, not only plagiarism but also
+paraphrase and patchwork plagiarism. In our case (cc MediaWatch) we
+use this algorithm to create relationships between similar articles and
+map the process, or the chain of misinformation. As our
+scope is to track propaganda networks in the news ecosystem,
+this algorithm only tested in such context.`
+
+// targetStopWords is sourceStopWords without its first seven entries.
+var targetStopWords = []string{
+	"we", "only", "on", "a", "of", "to", "the", "between", "two", "in", "with", "that", "let", "us", "not", "only", "but", "also", "and", "in", "our", "case", "we", "use", "this", "to", "between", "similar", "and", "the", "or", "the", "of", "as", "our", "is", "to", "in", "the", "this", "only", "in", "such",
+}
+
 func Test_NewDetector(t *testing.T) {
 	_, err := NewDetector()
 	if err != nil {
-		t.Errorf("Error while creating detector: %s", err)
+		t.Fatalf("Error while creating detector: %s", err)
 	}
 }
 
@@ -96,7 +128,160 @@ func Test_NewDetectorSetLangError(t *testing.T) {
 	}
 }
 
-func Test_NewDetectorWithStrings(t *testing.T) {
+// Test_NewDetectorDetectError checks that Detect fails when neither stopword
+// lists nor texts are set on the Detector.
+func Test_NewDetectorDetectError(t *testing.T) {
+	detector := &Detector{}
+
+	detector.N = 8
+	detector.Lang = "en"
+	detector.StopWords = StopWords[detector.Lang].([]string)
+
+	err := detector.Detect()
+
+	if err == nil {
+		t.Fatalf("Error in Detect, expected Error, got nil")
+	}
+}
+
+// Test_NewDetectorDetectStopWords checks Detect when only the stopword lists
+// are set on the Detector.
+func Test_NewDetectorDetectStopWords(t *testing.T) {
+	detector := &Detector{}
+
+	detector.N = 8
+	detector.Lang = "en"
+	detector.StopWords = StopWords[detector.Lang].([]string)
+	detector.SourceStopWords = sourceStopWords
+	detector.TargetStopWords = targetStopWords
+
+	err := detector.Detect()
+
+	if err != nil {
+		t.Fatalf("Error in Detect: %s", err.Error())
+	}
+	if detector.Score != 0.9113924050632911 {
+		t.Errorf("Error in Detect, expected %f, got %f", 0.9113924050632911, detector.Score)
+	}
+
+	if detector.Similar != 72 {
+		t.Errorf("Error in Detect, expected %d, got %d", 72, detector.Similar)
+	}
+
+	if detector.Total != 79 {
+		t.Errorf("Error in Detect, expected %d, got %d", 79, detector.Total)
+	}
+}
+
+// Test_NewDetectorDetectStrings checks Detect when only the texts are set on
+// the Detector.
+func Test_NewDetectorDetectStrings(t *testing.T) {
+	detector := &Detector{}
+
+	detector.N = 8
+	detector.Lang = "en"
+	detector.StopWords = StopWords[detector.Lang].([]string)
+	detector.SourceText = sourceString
+	detector.TargetText = targetString
+
+	err := detector.Detect()
+
+	if err != nil {
+		t.Fatalf("Error in Detect: %s", err.Error())
+	}
+	if detector.Score != 0.9113924050632911 {
+		t.Errorf("Error in Detect, expected %f, got %f", 0.9113924050632911, detector.Score)
+	}
+
+	if detector.Similar != 72 {
+		t.Errorf("Error in Detect, expected %d, got %d", 72, detector.Similar)
+	}
+
+	if detector.Total != 79 {
+		t.Errorf("Error in Detect, expected %d, got %d", 79, detector.Total)
+	}
+}
+
+// Test_NewDetectorWithStringsWithStruct checks DetectWithStrings on a Detector
+// built directly as a Detector value rather than through NewDetector.
+func Test_NewDetectorWithStringsWithStruct(t *testing.T) {
+	detector := &Detector{}
+
+	detector.N = 8
+	detector.Lang = "en"
+	detector.StopWords = StopWords[detector.Lang].([]string)
+
+	err := detector.DetectWithStrings(sourceString, targetString)
+
+	if err != nil {
+		t.Fatalf("Error in DetectWithStrings: %s", err.Error())
+	}
+	if detector.Score != 0.9113924050632911 {
+		t.Errorf("Error in DetectWithStrings, expected %f, got %f", 0.9113924050632911, detector.Score)
+	}
+
+	if detector.Similar != 72 {
+		t.Errorf("Error in DetectWithStrings, expected %d, got %d", 72, detector.Similar)
+	}
+
+	if detector.Total != 79 {
+		t.Errorf("Error in DetectWithStrings, expected %d, got %d", 79, detector.Total)
+	}
+}
+
+// Test_NewDetectorWithString checks DetectWithStrings on a Detector returned by
+// NewDetector.
+func Test_NewDetectorWithString(t *testing.T) {
+	detector, err := NewDetector()
+	if err != nil {
+		t.Fatalf("Error while creating detector: %s", err)
+	}
+
+	err = detector.DetectWithStrings(sourceString, targetString)
+
+	if err != nil {
+		t.Fatalf("Error in DetectWithStrings: %s", err.Error())
+	}
+	if detector.Score != 0.9113924050632911 {
+		t.Errorf("Error in DetectWithStrings, expected %f, got %f", 0.9113924050632911, detector.Score)
+	}
+
+	if detector.Similar != 72 {
+		t.Errorf("Error in DetectWithStrings, expected %d, got %d", 72, detector.Similar)
+	}
+
+	if detector.Total != 79 {
+		t.Errorf("Error in DetectWithStrings, expected %d, got %d", 79, detector.Total)
+	}
+}
+
+// Test_NewDetectorWithStopWords checks DetectWithStopWords on a Detector
+// returned by NewDetector.
+func Test_NewDetectorWithStopWords(t *testing.T) {
+	detector, err := NewDetector()
+	if err != nil {
+		t.Fatalf("Error while creating detector: %s", err)
+	}
+
+	err = detector.DetectWithStopWords(sourceStopWords, targetStopWords)
+
+	if err != nil {
+		t.Fatalf("Error in DetectWithStopWords: %s", err.Error())
+	}
+	if detector.Score != 0.9113924050632911 {
+		t.Errorf("Error in DetectWithStopWords, expected %f, got %f", 0.9113924050632911, detector.Score)
+	}
+
+	if detector.Similar != 72 {
+		t.Errorf("Error in DetectWithStopWords, expected %d, got %d", 72, detector.Similar)
+	}
+
+	if detector.Total != 79 {
+		t.Errorf("Error in DetectWithStopWords, expected %d, got %d", 79, detector.Total)
+	}
+}
+
+func Test_NewDetectorWithStringsMany(t *testing.T) {
 	var tests = []struct {
 		lang string
 		source string