Skip to content

Commit

Permalink
added Detect method
Browse files Browse the repository at this point in the history
  • Loading branch information
andefined committed Apr 26, 2021
1 parent 4d72986 commit 2832efa
Show file tree
Hide file tree
Showing 2 changed files with 182 additions and 2 deletions.
22 changes: 22 additions & 0 deletions plagiarism.go
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,28 @@ func (p *Detector) Equal(source, target []string) bool {
return true
}

// Detect runs the detection using the values already stored on the
// Detector instead of taking them as arguments.
//
// The inputs are chosen in this order:
//   - if both SourceStopWords and TargetStopWords are non-empty, they are
//     passed to DetectWithStopWords;
//   - otherwise, if both SourceText and TargetText are non-empty, they are
//     passed to DetectWithStrings;
//   - otherwise an error is returned, because neither pair is complete.
//
// The error of the delegated method is returned unchanged.
func (p *Detector) Detect() error {
	switch {
	case len(p.SourceStopWords) > 0 && len(p.TargetStopWords) > 0:
		// Stop-word lists are checked first, so they win when both
		// lists and texts are provided.
		return p.DetectWithStopWords(p.SourceStopWords, p.TargetStopWords)
	case p.SourceText != "" && p.TargetText != "":
		return p.DetectWithStrings(p.SourceText, p.TargetText)
	default:
		// Neither a complete pair of stop-word lists nor a complete pair
		// of texts is available.
		return fmt.Errorf("you should at least define source and target texts")
	}
}

// DetectWithStrings returns an error on failure, otherwise will invoke
// DetectWithStopWords method.
func (p *Detector) DetectWithStrings(source, target string) error {
Expand Down
162 changes: 160 additions & 2 deletions plagiarism_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,38 @@ import (
"testing"
)

// sourceString is the source text fixture used by the detector tests in this
// file. It is targetString preceded by two extra leading lines.
var sourceString = `Plagiarism detection using stopwords n-grams. go-plagiarism is the main algorithm
that utilizes MediaWatch and is inspired by Efstathios Stamatatos paper.
We only rely on a list of stopwords to calculate
the plagiarism probability between two texts, in combination with n-gram
loops that let us find, not only plagiarism but also
paraphrase and patchwork plagiarism. In our case (cc MediaWatch) we
use this algorithm to create relationships between similar articles and
map the process, or the chain of misinformation. As our
scope is to track propaganda networks in the news ecosystem,
this algorithm only tested in such context.`

// sourceStopWords is the source stop-word list fixture used by the tests in
// this file. Presumably these are the stop words of sourceString in order of
// appearance (the string-based and stop-word-based tests expect identical
// results) — confirm against the tokenizer before editing.
var sourceStopWords = []string{
"using", "is", "the", "that", "and", "is", "by", "we", "only", "on", "a", "of", "to", "the", "between", "two", "in", "with", "that", "let", "us", "not", "only", "but", "also", "and", "in", "our", "case", "we", "use", "this", "to", "between", "similar", "and", "the", "or", "the", "of", "as", "our", "is", "to", "in", "the", "this", "only", "in", "such",
}

// targetString is the target text fixture used by the detector tests in this
// file. It equals sourceString without that text's first two lines.
var targetString = `We only rely on a list of stopwords to calculate
the plagiarism probability between two texts, in combination with n-gram
loops that let us find, not only plagiarism but also
paraphrase and patchwork plagiarism. In our case (cc MediaWatch) we
use this algorithm to create relationships between similar articles and
map the process, or the chain of misinformation. As our
scope is to track propaganda networks in the news ecosystem,
this algorithm only tested in such context.`

// targetStopWords is the target stop-word list fixture used by the tests in
// this file. It equals sourceStopWords without that list's first seven
// entries. Presumably these are the stop words of targetString in order of
// appearance — confirm against the tokenizer before editing.
var targetStopWords = []string{
"we", "only", "on", "a", "of", "to", "the", "between", "two", "in", "with", "that", "let", "us", "not", "only", "but", "also", "and", "in", "our", "case", "we", "use", "this", "to", "between", "similar", "and", "the", "or", "the", "of", "as", "our", "is", "to", "in", "the", "this", "only", "in", "such",
}

// Test_NewDetector checks that NewDetector returns without an error.
func Test_NewDetector(t *testing.T) {
	// Report the failure once and stop; a preceding t.Errorf with the same
	// message would only duplicate the output.
	if _, err := NewDetector(); err != nil {
		t.Fatalf("Error while creating detector: %s", err)
	}
}

Expand Down Expand Up @@ -96,7 +124,137 @@ func Test_NewDetectorSetLangError(t *testing.T) {
}
}

func Test_NewDetectorWithStrings(t *testing.T) {
// Test_NewDetectorDetectError verifies that Detect returns an error when the
// detector has neither stop-word lists nor texts set.
func Test_NewDetectorDetectError(t *testing.T) {
	detector := &Detector{N: 8, Lang: "en"}
	detector.StopWords = StopWords[detector.Lang].([]string)

	// No source/target inputs were assigned, so Detect must fail.
	if err := detector.Detect(); err == nil {
		t.Fatalf("Error in Detect, expected Error, got nil")
	}
}
// Test_NewDetectorDetectStopWords checks that Detect succeeds when only the
// source and target stop-word lists are set on the detector, and that it
// produces the expected score and counts.
func Test_NewDetectorDetectStopWords(t *testing.T) {
	const (
		wantScore   = 0.9113924050632911
		wantSimilar = 72
		wantTotal   = 79
	)

	detector := &Detector{}

	detector.N = 8
	detector.Lang = "en"
	detector.StopWords = StopWords[detector.Lang].([]string)
	detector.SourceStopWords = sourceStopWords
	detector.TargetStopWords = targetStopWords

	if err := detector.Detect(); err != nil {
		t.Fatalf("Error in Detect: %s", err)
	}

	// %v prints the float at full precision; %f would round both values to
	// six decimals and could print identical numbers for a failing check.
	if detector.Score != wantScore {
		t.Errorf("Error in Detect, expected score %v, got %v", wantScore, detector.Score)
	}

	if detector.Similar != wantSimilar {
		t.Errorf("Error in Detect, expected similar %d, got %d", wantSimilar, detector.Similar)
	}

	if detector.Total != wantTotal {
		t.Errorf("Error in Detect, expected total %d, got %d", wantTotal, detector.Total)
	}
}
// Test_NewDetectorDetectStrings checks that Detect succeeds when only the
// source and target texts are set on the detector, and that it produces the
// expected score and counts.
func Test_NewDetectorDetectStrings(t *testing.T) {
	const (
		wantScore   = 0.9113924050632911
		wantSimilar = 72
		wantTotal   = 79
	)

	detector := &Detector{}

	detector.N = 8
	detector.Lang = "en"
	detector.StopWords = StopWords[detector.Lang].([]string)
	detector.SourceText = sourceString
	detector.TargetText = targetString

	if err := detector.Detect(); err != nil {
		t.Fatalf("Error in Detect: %s", err)
	}

	// %v prints the float at full precision; %f would round both values to
	// six decimals and could print identical numbers for a failing check.
	if detector.Score != wantScore {
		t.Errorf("Error in Detect, expected score %v, got %v", wantScore, detector.Score)
	}

	if detector.Similar != wantSimilar {
		t.Errorf("Error in Detect, expected similar %d, got %d", wantSimilar, detector.Similar)
	}

	if detector.Total != wantTotal {
		t.Errorf("Error in Detect, expected total %d, got %d", wantTotal, detector.Total)
	}
}

// Test_NewDetectorWithStringsWithStruct checks DetectWithStrings on a
// detector built as a struct literal (not via NewDetector) and verifies the
// expected score and counts.
func Test_NewDetectorWithStringsWithStruct(t *testing.T) {
	const (
		wantScore   = 0.9113924050632911
		wantSimilar = 72
		wantTotal   = 79
	)

	detector := &Detector{}

	detector.N = 8
	detector.Lang = "en"
	detector.StopWords = StopWords[detector.Lang].([]string)

	if err := detector.DetectWithStrings(sourceString, targetString); err != nil {
		t.Fatalf("Error in DetectWithStrings: %s", err)
	}

	// %v prints the float at full precision; %f would round both values to
	// six decimals and could print identical numbers for a failing check.
	if detector.Score != wantScore {
		t.Errorf("Error in DetectWithStrings, expected score %v, got %v", wantScore, detector.Score)
	}

	if detector.Similar != wantSimilar {
		t.Errorf("Error in DetectWithStrings, expected similar %d, got %d", wantSimilar, detector.Similar)
	}

	if detector.Total != wantTotal {
		t.Errorf("Error in DetectWithStrings, expected total %d, got %d", wantTotal, detector.Total)
	}
}
// Test_NewDetectorWithString checks DetectWithStrings on a detector obtained
// from NewDetector and verifies the expected score and counts.
func Test_NewDetectorWithString(t *testing.T) {
	const (
		wantScore   = 0.9113924050632911
		wantSimilar = 72
		wantTotal   = 79
	)

	// Fail fast on a constructor error instead of discarding it and
	// continuing with a detector whose state is unknown.
	detector, err := NewDetector()
	if err != nil {
		t.Fatalf("Error while creating detector: %s", err)
	}

	if err := detector.DetectWithStrings(sourceString, targetString); err != nil {
		t.Fatalf("Error in DetectWithStrings: %s", err)
	}

	// %v prints the float at full precision; %f would round both values to
	// six decimals and could print identical numbers for a failing check.
	if detector.Score != wantScore {
		t.Errorf("Error in DetectWithStrings, expected score %v, got %v", wantScore, detector.Score)
	}

	if detector.Similar != wantSimilar {
		t.Errorf("Error in DetectWithStrings, expected similar %d, got %d", wantSimilar, detector.Similar)
	}

	if detector.Total != wantTotal {
		t.Errorf("Error in DetectWithStrings, expected total %d, got %d", wantTotal, detector.Total)
	}
}

// Test_NewDetectorWithStopWords checks DetectWithStopWords on a detector
// obtained from NewDetector and verifies the expected score and counts.
func Test_NewDetectorWithStopWords(t *testing.T) {
	const (
		wantScore   = 0.9113924050632911
		wantSimilar = 72
		wantTotal   = 79
	)

	// Fail fast on a constructor error instead of discarding it and
	// continuing with a detector whose state is unknown.
	detector, err := NewDetector()
	if err != nil {
		t.Fatalf("Error while creating detector: %s", err)
	}

	if err := detector.DetectWithStopWords(sourceStopWords, targetStopWords); err != nil {
		t.Fatalf("Error in DetectWithStopWords: %s", err)
	}

	// %v prints the float at full precision; %f would round both values to
	// six decimals and could print identical numbers for a failing check.
	if detector.Score != wantScore {
		t.Errorf("Error in DetectWithStopWords, expected score %v, got %v", wantScore, detector.Score)
	}

	if detector.Similar != wantSimilar {
		t.Errorf("Error in DetectWithStopWords, expected similar %d, got %d", wantSimilar, detector.Similar)
	}

	if detector.Total != wantTotal {
		t.Errorf("Error in DetectWithStopWords, expected total %d, got %d", wantTotal, detector.Total)
	}
}

func Test_NewDetectorWithStringsMany(t *testing.T) {
var tests = []struct {
lang string
source string
Expand Down

0 comments on commit 2832efa

Please sign in to comment.