Skip to content

Commit

Permalink
update the training data generation process, remove wrongly hardcoded flavor, fix parameters
Browse files Browse the repository at this point in the history
  • Loading branch information
lfoppiano committed Dec 25, 2024
1 parent fc31e34 commit 57fbe4d
Show file tree
Hide file tree
Showing 5 changed files with 30 additions and 10 deletions.
8 changes: 4 additions & 4 deletions grobid-core/src/main/java/org/grobid/core/engines/Engine.java
Original file line number Diff line number Diff line change
Expand Up @@ -538,9 +538,9 @@ public void createTrainingBlank(File inputFile, String pathRaw, String pathTEI,
* file to be corrected for gold-level training data)
* @param id : an optional ID to be used in the TEI file, -1 if not used
* @param flavor : the model flavor forwarded to the full-text parser's createTraining call
*/
public void createTraining(File inputFile, String pathRaw, String pathTEI, int id) {
public void createTraining(File inputFile, String pathRaw, String pathTEI, int id, GrobidModels.Flavor flavor) {
System.out.println(inputFile.getPath());
Document doc = parsers.getFullTextParser().createTraining(inputFile, pathRaw, pathTEI, id, GrobidModels.Flavor.ARTICLE_LIGHT);
Document doc = parsers.getFullTextParser().createTraining(inputFile, pathRaw, pathTEI, id, flavor);
}

/**
Expand Down Expand Up @@ -629,7 +629,7 @@ public Document fullTextToTEIDoc(DocumentSource documentSource,
* will be included if ind = -1
* @param flavor : the model flavor passed to createTraining for each PDF file processed
* @return the number of processed files.
*/
public int batchCreateTraining(String directoryPath, String resultPath, int ind) {
public int batchCreateTraining(String directoryPath, String resultPath, int ind, GrobidModels.Flavor flavor) {
try {
File path = new File(directoryPath);
// we process all pdf files in the directory
Expand All @@ -652,7 +652,7 @@ public boolean accept(File dir, String name) {
}
for (final File pdfFile : refFiles) {
try {
createTraining(pdfFile, resultPath, resultPath, ind + n);
createTraining(pdfFile, resultPath, resultPath, ind + n, flavor);
} catch (final Exception exp) {
LOGGER.error("An error occured while processing the following pdf: "
+ pdfFile.getPath(), exp);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,7 @@ private void processReferencesDirectory(File[] files, final GrobidMainArgs pGbdA
public void createTraining(final GrobidMainArgs pGbdArgs) {
inferPdfInputPath(pGbdArgs);
inferOutputPath(pGbdArgs);
int result = getEngine().batchCreateTraining(pGbdArgs.getPath2Input(), pGbdArgs.getPath2Output(), -1);
int result = getEngine().batchCreateTraining(pGbdArgs.getPath2Input(), pGbdArgs.getPath2Output(), -1, pGbdArgs.getModelFlavor());
LOGGER.info(result + " files processed.");
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
package org.grobid.core.main.batch;

import java.io.File;
import java.sql.SQLOutput;
import java.util.Arrays;
import java.util.List;

import org.grobid.core.GrobidModels;
import org.grobid.core.engines.ProcessEngine;
import org.grobid.core.main.GrobidHomeFinder;
import org.grobid.core.main.LibraryLoader;
Expand Down Expand Up @@ -168,6 +170,21 @@ protected static boolean processArgs(final String[] pArgs) {
gbdArgs.setRecursive(true);
continue;
}

if (currArg.equals("-flavor")) {
final String command = pArgs[i + 1];

GrobidModels.Flavor flavor = GrobidModels.Flavor.fromLabel(command);
if (flavor != null) {
System.out.println("Setting model flavor to: " + flavor);
gbdArgs.setModelFlavor(flavor);
i++;
continue;
} else {
System.out.println("No model flavor, using the default models");
break;
}
}
}
}
return result;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package org.grobid.core.main.batch;

import org.grobid.core.GrobidModels;

/**
* Class containing args of the batch {@link GrobidMain}.
*
Expand Down Expand Up @@ -34,7 +36,7 @@ public class GrobidMainArgs {

private boolean addElementId = false;

private String modelFlavor = null;
private GrobidModels.Flavor modelFlavor = null;

/**
* @return the path2grobidHome
Expand Down Expand Up @@ -243,11 +245,11 @@ public final void setSegmentSentences(final boolean pSegmentSentences) {
segmentSentences = pSegmentSentences;
}

public String getModelFlavor() {
public GrobidModels.Flavor getModelFlavor() {
return modelFlavor;
}

public void setModelFlavor(String modelFlavor) {
public void setModelFlavor(GrobidModels.Flavor modelFlavor) {
this.modelFlavor = modelFlavor;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import net.sf.saxon.trans.XPathException;
import org.apache.commons.io.FileUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.grobid.core.GrobidModels;
import org.grobid.core.data.Figure;
import org.grobid.core.data.Table;
import org.grobid.core.document.Document;
Expand Down Expand Up @@ -64,7 +65,7 @@ private static Set<Integer> getVectorGraphicPages(File pdfaltoDirectory) throws
return blacklistedPages;
}

private static void processPdfFile(File input, File outputFolder) throws Exception {
private static void processPdfFile(File input, File outputFolder, GrobidModels.Flavor flavor) throws Exception {
inputPdf = input;
annotated = false;
annotatedFigure = false;
Expand Down Expand Up @@ -99,7 +100,7 @@ private static void processPdfFile(File input, File outputFolder) throws Excepti

blacklistedPages = getVectorGraphicPages(pdfaltoDirectory);

Document teiDoc = engine.fullTextToTEIDoc(documentSource, null, config);
Document teiDoc = engine.fullTextToTEIDoc(documentSource, flavor, config);

PDDocument out = annotateFigureAndTables(
document, copiedFile, teiDoc,
Expand Down

0 comments on commit 57fbe4d

Please sign in to comment.