Skip to content
This repository has been archived by the owner on Dec 24, 2019. It is now read-only.

Commit

Permalink
Improves and fixes documentation classifier
Browse files Browse the repository at this point in the history
Fix regex for getting started

Improves the classifier by including the title as a heading
  • Loading branch information
creat89 committed Sep 16, 2019
1 parent fde416a commit f1ff043
Show file tree
Hide file tree
Showing 6 changed files with 55 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,6 @@ public void setUses(List<IMetricProvider> uses) {
/**
 * Lists the metric providers this provider relies on.
 *
 * @return a fixed-size list holding the canonical class names of
 *         {@code IndexPreparationTransMetricProvider} and
 *         {@code TopicsTransMetricProvider}, in that order
 */
public List<String> getIdentifiersOfUses() {
    final String indexPreparationId = IndexPreparationTransMetricProvider.class.getCanonicalName();
    final String topicsId = TopicsTransMetricProvider.class.getCanonicalName();
    return Arrays.asList(indexPreparationId, topicsId);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ Require-Bundle: org.eclipse.core.runtime,
org.eclipse.scava.metricprovider.trans.indexing.preparation,
org.eclipse.scava.metricprovider.trans.documentation,
org.eclipse.scava.nlp.tools.predictions,
org.eclipse.scava.nlp.classifiers.documentation
org.eclipse.scava.nlp.classifiers.documentation,
org.eclipse.scava.nlp.tools.other
Bundle-Activator: org.eclipse.scava.metricprovider.trans.documentation.classification.Activator
Bundle-ActivationPolicy: lazy
Bundle-RequiredExecutionEnvironment: JavaSE-1.8
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.eclipse.scava.metricprovider.trans.documentation.DocumentationTransMetricProvider;
import org.eclipse.scava.metricprovider.trans.documentation.classification.model.DocumentationClassificationTransMetric;
Expand Down Expand Up @@ -35,6 +37,21 @@ public class DocumentationClassificationTransMetricProvider implements ITransien
protected List<IMetricProvider> uses;
protected MetricProviderContext context;

private static Pattern fileRegex;
private static Pattern extensionRegex;
private static Pattern fileNameRegex;
private static Pattern spacing1;
private static Pattern spacing2;

static {
fileRegex=Pattern.compile("([^/]+)$");
extensionRegex=Pattern.compile("\\.[^\\.]+$");
fileNameRegex=Pattern.compile("\\W");
spacing1=Pattern.compile("\\h+");
spacing2=Pattern.compile("(^ | $)");
}


@Override
public String getIdentifier() {
return DocumentationClassificationTransMetricProvider.class.getCanonicalName();
Expand Down Expand Up @@ -133,7 +150,7 @@ public void measure(Project project, ProjectDelta delta, DocumentationClassifica
pdfDocument=true;
else
pdfDocument=false;
multiLabelPrediction = DocumentationClassifier.classify(multiLabelPrediction, pdfDocument);
multiLabelPrediction = DocumentationClassifier.classify(multiLabelPrediction,convertFileNameToText(documentationEntry.getEntryId()), pdfDocument);
documentationEntryClass.getTypes().addAll(multiLabelPrediction.getLabels());
}
else
Expand All @@ -146,6 +163,18 @@ public void measure(Project project, ProjectDelta delta, DocumentationClassifica

}

/**
 * Converts the identifier of a documentation entry (a file path) into plain
 * text so that it can be handed to the classifier as an additional heading.
 * <p>
 * The last path segment is kept, its extension is removed, every non-word
 * character and every underscore becomes a space, and the result is
 * whitespace-normalised. For example {@code "docs/getting_started.md"}
 * yields {@code "getting started"}.
 *
 * @param entryId the entry identifier; may be {@code null}
 * @return the cleaned-up file name, or an empty string when {@code entryId} is {@code null}
 */
private String convertFileNameToText(String entryId)
{
    if(entryId==null)
        return "";
    String text=entryId;
    // Keep only what follows the last '/'; if nothing matches, the input is used as is.
    Matcher lastSegment = fileRegex.matcher(text);
    if(lastSegment.find())
        text=lastSegment.group(0);
    text=extensionRegex.matcher(text).replaceAll("");
    text=fileNameRegex.matcher(text).replaceAll(" ");
    // '_' counts as a word character, so \W leaves it untouched; names such as
    // "getting_started" would otherwise never match lexicon entries written with spaces.
    text=text.replace('_', ' ');
    // Collapse runs of whitespace, then drop the single space left at either end.
    text=spacing1.matcher(text).replaceAll(" ");
    text=spacing2.matcher(text).replaceAll("");
    return text;
}

private DocumentationEntryClassification findDocumentationEntryClassification (DocumentationClassificationTransMetric db, DocumentationEntry documentationEntry)
{
return findDocumentationEntryClassification(db, documentationEntry.getDocumentationId(), documentationEntry.getEntryId());
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
package org.eclipse.scava.metricprovider.trans.documentation.model;

import com.mongodb.*;
import java.util.*;
import com.googlecode.pongo.runtime.*;
import com.googlecode.pongo.runtime.querying.*;
import com.googlecode.pongo.runtime.Pongo;
import com.googlecode.pongo.runtime.querying.StringQueryProducer;


public class DocumentationEntry extends Pongo {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@
#2#quick start guides
#2#quick start guide
#2#a quick start guide to
#2#quick starter guide
######REGEX
(?:(?:guide|documentation|manual|reference|tutorial|basics|introduction)s? to )?getting started(?: with)?
(?:a )quick.start (?:guide|documentation|manual|reference|tutorial|basics|introduction)s?(?: to)?
(?:a )?quick.start(?:er)? (?:guide|documentation|manual|reference|tutorial|basics|introduction)s?(?: to)?
(?:(?:(?:how.|(?:simple way ))to)|(?:before you)) start
(?:(?:introduction|start|beginner) )?tutorials?

Original file line number Diff line number Diff line change
Expand Up @@ -56,5 +56,24 @@ public static MultiLabelPrediction classify(MultiLabelPrediction documentation,
return documentation;
}

/**
 * Classifies a documentation entry from its headings and from its file name,
 * the file name being treated as one more heading.
 *
 * @param documentation The text in the MultiLabelPrediction is expected to be in HTML format, as it is
 * the only way to determine the headings of a file.
 * @param fileName The name of the file in a processed (plain text) format. It is ignored when {@code null}.
 * @param fromPDF Forwarded unchanged to {@code HtmlHeadingsFinder.find}.
 * NOTE(review): presumably tells whether the HTML was generated from a PDF - confirm.
 * @return The same {@code documentation} instance, with its labels replaced by the classes found.
 */
public static MultiLabelPrediction classify(MultiLabelPrediction documentation, String fileName, boolean fromPDF)
{
    // Work on a private copy: the list handed back by the finder may be unmodifiable or shared.
    List<String> headings = new ArrayList<String>(HtmlHeadingsFinder.find(documentation.getText(), fromPDF));
    if(fileName!=null)
        headings.add(fileName);
    // The set is filled from a parallel stream, so it has to be synchronized.
    Set<String> documentationClasses = Collections.synchronizedSet(new HashSet<String>(headings.size()));
    headings.parallelStream().forEach(h->documentationClasses.add(findDocumentationClass(h)));
    // The empty string is discarded so that it never ends up as a label.
    documentationClasses.remove("");
    documentation.setLabels(new ArrayList<String>(documentationClasses));
    return documentation;
}


}

0 comments on commit f1ff043

Please sign in to comment.