TIKA-4278 -- remove colon from default and allow users to customize delimiters (#1976)

* TIKA-4278 -- remove colon from default and allow users to customize delimiters
apache · Oct 11, 2024 · 5f43b00 · 5f43b00
1 parent b5c802d
commit 5f43b00
Show file tree

Hide file tree

Showing 6 changed files with 145 additions and 34 deletions.
diff --git a/...-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/CSVSniffer.java b/...-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/CSVSniffer.java
@@ -95,7 +95,7 @@ CSVResult getBest(Reader reader, Metadata metadata) throws IOException {
         }
         // TIKA-4278: colon isn't reliable, e.g. govdocs1/242/242970.txt
         if (results.size() > 1 && bestResult.getDelimiter().equals(':') &&
-                results.get(1).getConfidence() == bestResult.getConfidence()) {
+                Math.abs(results.get(1).getConfidence() - bestResult.getConfidence()) < 0.0001) {
             return results.get(1);
         }
         return bestResult;

diff --git a/...es/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/TextAndCSVConfig.java b/...es/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/TextAndCSVConfig.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.csv;
+
+import java.io.Serializable;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Configuration for {@link TextAndCSVParser}: the candidate delimiter characters
+ * and the name reported for each of them.
+ * <p>
+ * The same data is kept in two directions (name to delimiter, delimiter to name).
+ * {@link #setNameToDelimiterMap(Map)} replaces both at once so that they agree.
+ * <p>
+ * The built-in defaults are comma, tab, pipe and semicolon; colon is not a
+ * default (TIKA-4278) and has to be added explicitly.
+ */
+public class TextAndCSVConfig implements Serializable {
+
+    //built-in defaults; private and only ever used as the source for per-instance copies
+    private static final Map<Character, String> DELIMITER_TO_NAME_MAP = new HashMap<>();
+    private static final Map<String, Character> NAME_TO_DELIMITER_MAP = new HashMap<>();
+
+    static {
+        DELIMITER_TO_NAME_MAP.put(',', "comma");
+        DELIMITER_TO_NAME_MAP.put('\t', "tab");
+        DELIMITER_TO_NAME_MAP.put('|', "pipe");
+        DELIMITER_TO_NAME_MAP.put(';', "semicolon");
+        //derive the reverse direction from the single list above
+        for (Map.Entry<Character, String> e : DELIMITER_TO_NAME_MAP.entrySet()) {
+            NAME_TO_DELIMITER_MAP.put(e.getValue(), e.getKey());
+        }
+    }
+
+    //Each instance starts from its own copy of the defaults. The getters return
+    //these maps directly, so sharing the static maps would let a caller that
+    //mutates a returned map change the defaults seen by every other instance.
+    private Map<String, Character> nameToDelimiterMap = new HashMap<>(NAME_TO_DELIMITER_MAP);
+    private Map<Character, String> delimiterToNameMap = new HashMap<>(DELIMITER_TO_NAME_MAP);
+
+    /**
+     * @return the map from delimiter name to delimiter character held by this
+     *         instance (the map itself, not a copy)
+     */
+    public Map<String, Character> getNameToDelimiterMap() {
+        return nameToDelimiterMap;
+    }
+
+    /**
+     * @return the map from delimiter character to delimiter name held by this
+     *         instance (the map itself, not a copy)
+     */
+    public Map<Character, String> getDelimiterToNameMap() {
+        return delimiterToNameMap;
+    }
+
+    /**
+     * Replaces the delimiters of this instance. The supplied map is copied, and
+     * the delimiter-to-name direction is rebuilt from it. If several names map to
+     * the same character, the name kept for that character is whichever one the
+     * supplied map iterates last.
+     *
+     * @param nameToDelimiterMap delimiter name to delimiter character
+     * @throws NullPointerException if {@code nameToDelimiterMap} is null
+     */
+    public void setNameToDelimiterMap(Map<String, Character> nameToDelimiterMap) {
+        Map<String, Character> names = new HashMap<>(nameToDelimiterMap);
+        Map<Character, String> delimiters = new HashMap<>();
+        for (Map.Entry<String, Character> e : nameToDelimiterMap.entrySet()) {
+            delimiters.put(e.getValue(), e.getKey());
+        }
+        //assign only once both directions are complete, so the fields never
+        //hold a half-populated reverse map
+        this.nameToDelimiterMap = names;
+        this.delimiterToNameMap = delimiters;
+    }
+}
diff --git a/...es/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java b/...es/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java
@@ -40,6 +40,7 @@
 import org.apache.tika.config.Field;
 import org.apache.tika.detect.AutoDetectReader;
 import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.exception.TikaConfigException;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Property;
@@ -95,25 +96,9 @@ public class TextAndCSVParser extends AbstractEncodingDetectorParser {
     private static final String TABLE = "table";
     private static final int DEFAULT_MARK_LIMIT = 20000;
 
-    private static final Map<Character, String> CHAR_TO_STRING_DELIMITER_MAP = new HashMap<>();
-    private static final Map<String, Character> STRING_TO_CHAR_DELIMITER_MAP = new HashMap<>();
     private static final Set<MediaType> SUPPORTED_TYPES = Collections
             .unmodifiableSet(new HashSet<>(Arrays.asList(CSV, TSV, MediaType.TEXT_PLAIN)));
 
-    static {
-        CHAR_TO_STRING_DELIMITER_MAP.put(',', "comma");
-        CHAR_TO_STRING_DELIMITER_MAP.put('\t', "tab");
-        CHAR_TO_STRING_DELIMITER_MAP.put('|', "pipe");
-        CHAR_TO_STRING_DELIMITER_MAP.put(';', "semicolon");
-        CHAR_TO_STRING_DELIMITER_MAP.put(':', "colon");
-    }
-
-    static {
-        for (Map.Entry<Character, String> e : CHAR_TO_STRING_DELIMITER_MAP.entrySet()) {
-            STRING_TO_CHAR_DELIMITER_MAP.put(e.getValue(), e.getKey());
-        }
-    }
-
     /**
      * This is the mark limit in characters (not bytes) to
      * read from the stream when classifying the stream as
@@ -157,6 +142,7 @@ static boolean isCSVOrTSV(MediaType mediaType) {
         return mediaType.getBaseType().equals(TSV) || mediaType.getBaseType().equals(CSV);
     }
 
+    private final TextAndCSVConfig defaultTextAndCSVConfig = new TextAndCSVConfig();
     @Override
     public Set<MediaType> getSupportedTypes(ParseContext context) {
         return SUPPORTED_TYPES;
@@ -165,12 +151,13 @@ public Set<MediaType> getSupportedTypes(ParseContext context) {
     @Override
     public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
                       ParseContext context) throws IOException, SAXException, TikaException {
+        TextAndCSVConfig textAndCSVConfig = context.get(TextAndCSVConfig.class, defaultTextAndCSVConfig);
 
-        CSVParams params = getOverride(metadata);
+        CSVParams params = getOverride(metadata, textAndCSVConfig);
         Reader reader;
         Charset charset;
         if (!params.isComplete()) {
-            reader = detect(params, stream, metadata, context);
+            reader = detect(params, textAndCSVConfig, stream, metadata, context);
             if (params.getCharset() != null) {
                 charset = params.getCharset();
             } else {
@@ -181,7 +168,7 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
             charset = params.getCharset();
         }
 
-        updateMetadata(params, metadata);
+        updateMetadata(params, metadata, textAndCSVConfig);
 
         //if text or a non-csv/tsv category of text
         //treat this as text and be done
@@ -193,8 +180,7 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
         }
 
         CSVFormat csvFormat = CSVFormat.EXCEL.builder().setDelimiter(params.getDelimiter()).build();
-        metadata.set(DELIMITER_PROPERTY,
-                CHAR_TO_STRING_DELIMITER_MAP.get(csvFormat.getDelimiterString().charAt(0)));
+        metadata.set(DELIMITER_PROPERTY, textAndCSVConfig.getDelimiterToNameMap().get(csvFormat.getDelimiterString().charAt(0)));
 
         XHTMLContentHandler xhtmlContentHandler = new XHTMLContentHandler(handler, metadata);
         int totalRows = 0;
@@ -273,7 +259,7 @@ private void handleText(Reader reader, Charset charset, ContentHandler handler,
         xhtml.endDocument();
     }
 
-    private Reader detect(CSVParams params, InputStream stream, Metadata metadata,
+    private Reader detect(CSVParams params, TextAndCSVConfig textAndCSVConfig, InputStream stream, Metadata metadata,
                           ParseContext context) throws IOException, TikaException {
         //if the file was already identified as not .txt, .csv or .tsv
         //don't even try to csv or not
@@ -302,15 +288,15 @@ private Reader detect(CSVParams params, InputStream stream, Metadata metadata,
         if (params.getDelimiter() == null &&
                 (params.getMediaType() == null || isCSVOrTSV(params.getMediaType()))) {
 
-            CSVSniffer sniffer = new CSVSniffer(markLimit, CHAR_TO_STRING_DELIMITER_MAP.keySet(), minConfidence);
+            CSVSniffer sniffer = new CSVSniffer(markLimit, textAndCSVConfig.getDelimiterToNameMap().keySet(), minConfidence);
             CSVResult result = sniffer.getBest(reader, metadata);
             params.setMediaType(result.getMediaType());
             params.setDelimiter(result.getDelimiter());
         }
         return reader;
     }
 
-    private CSVParams getOverride(Metadata metadata) {
+    private CSVParams getOverride(Metadata metadata, TextAndCSVConfig textAndCSVConfig) {
         String override = metadata.get(TikaCoreProperties.CONTENT_TYPE_USER_OVERRIDE);
         if (override == null) {
             return new CSVParams();
@@ -332,22 +318,22 @@ private CSVParams getOverride(Metadata metadata) {
             return new CSVParams(mediaType, charset);
         }
 
-        String delimiterString = mediaType.getParameters().get(DELIMITER);
-        if (delimiterString == null) {
+        String delimiterName = mediaType.getParameters().get(DELIMITER);
+        if (delimiterName == null) {
             return new CSVParams(mediaType, charset);
         }
-        if (STRING_TO_CHAR_DELIMITER_MAP.containsKey(delimiterString)) {
+        if (textAndCSVConfig.getNameToDelimiterMap().containsKey(delimiterName)) {
             return new CSVParams(mediaType, charset,
-                    (char) STRING_TO_CHAR_DELIMITER_MAP.get(delimiterString));
+                    (char) textAndCSVConfig.getNameToDelimiterMap().get(delimiterName));
         }
-        if (delimiterString.length() == 1) {
-            return new CSVParams(mediaType, charset, delimiterString.charAt(0));
+        if (delimiterName.length() == 1) {
+            return new CSVParams(mediaType, charset, delimiterName.charAt(0));
         }
         //TODO: log bad/unrecognized delimiter string
         return new CSVParams(mediaType, charset);
     }
 
-    private void updateMetadata(CSVParams params, Metadata metadata) {
+    private void updateMetadata(CSVParams params, Metadata metadata, TextAndCSVConfig textAndCSVConfig) {
         MediaType mediaType = null;
         if (params.getMediaType().getBaseType().equals(MediaType.TEXT_PLAIN)) {
             mediaType = MediaType.TEXT_PLAIN;
@@ -369,8 +355,8 @@ private void updateMetadata(CSVParams params, Metadata metadata) {
             metadata.set(Metadata.CONTENT_ENCODING, params.getCharset().name());
         }
         if (!MediaType.TEXT_PLAIN.equals(mediaType) && params.getDelimiter() != null) {
-            if (CHAR_TO_STRING_DELIMITER_MAP.containsKey(params.getDelimiter())) {
-                attrs.put(DELIMITER, CHAR_TO_STRING_DELIMITER_MAP.get(params.getDelimiter()));
+            if (textAndCSVConfig.getDelimiterToNameMap().containsKey(params.getDelimiter())) {
+                attrs.put(DELIMITER, textAndCSVConfig.getDelimiterToNameMap().get(params.getDelimiter()));
             } else {
                 attrs.put(DELIMITER, Integer.toString((int) params.getDelimiter()));
             }
@@ -379,4 +365,16 @@ private void updateMetadata(CSVParams params, Metadata metadata) {
         metadata.set(Metadata.CONTENT_TYPE, type.toString());
     }
 
+    /**
+     * Sets the named delimiters used when the {@link ParseContext} passed to
+     * {@code parse} does not carry its own {@link TextAndCSVConfig}. Annotated with
+     * {@link Field} so it can be set from a tika-config
+     * {@code <param name="nameToDelimiterMap" type="map">} element.
+     *
+     * @param map delimiter name (e.g. "colon") to a one-character delimiter string
+     * @throws TikaConfigException if any value is null or is not exactly one
+     *                             character long
+     */
+    @Field
+    public void setNameToDelimiterMap(Map<String, String> map) throws TikaConfigException {
+        Map<String, Character> m = new HashMap<>();
+        for (Map.Entry<String, String> e : map.entrySet()) {
+            String delimiter = e.getValue();
+            //an empty value must be rejected here; otherwise charAt(0) below would
+            //fail with an unchecked StringIndexOutOfBoundsException
+            if (delimiter == null || delimiter.length() != 1) {
+                throw new TikaConfigException("delimiter must be a single character: " + delimiter);
+            }
+            m.put(e.getKey(), delimiter.charAt(0));
+        }
+        defaultTextAndCSVConfig.setNameToDelimiterMap(m);
+    }
+
 }
diff --git a/...ika-parser-text-module/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java b/...ika-parser-text-module/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java
@@ -240,6 +240,18 @@ public void testSubclassingMimeTypesRemain() throws Exception {
         assertEquals("text/x-vcalendar; charset=ISO-8859-1", r.metadata.get(Metadata.CONTENT_TYPE));
     }
 
+    /**
+     * Loads a tika-config that registers "colon" through nameToDelimiterMap and
+     * checks that a colon-delimited file is reported with that delimiter name,
+     * both in the delimiter property and in the content type.
+     */
+    @Test
+    public void testCustomizingDelimiter() throws Exception {
+        Parser p;
+        try (InputStream configStream = TextAndCSVParserTest.class
+                .getResourceAsStream("/test-configs/tika-config-colon-delimiter.xml")) {
+            p = new AutoDetectParser(new TikaConfig(configStream));
+        }
+        XMLResult result = getXML("testColonDelimited.txt", p);
+        String delimiterName = result.metadata.get(TextAndCSVParser.DELIMITER_PROPERTY);
+        String contentType = result.metadata.get(Metadata.CONTENT_TYPE);
+        assertEquals("colon", delimiterName);
+        assertContains("colon", contentType);
+    }
+
     private void assertContainsIgnoreWhiteSpaceDiffs(String expected, String xml) {
         assertContains(expected, xml.replaceAll("[\r\n\t ]", " "));
     }

diff --git a/...s/tika-parser-text-module/src/test/resources/test-configs/tika-config-colon-delimiter.xml b/...s/tika-parser-text-module/src/test/resources/test-configs/tika-config-colon-delimiter.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+  <parsers>
+    <parser class="org.apache.tika.parser.DefaultParser"/>
+    <parser class="org.apache.tika.parser.csv.TextAndCSVParser">
+      <params>
+        <param name="nameToDelimiterMap" type="map">
+          <entry key="comma" value=","/>
+          <entry key="colon" value=":"/>
+        </param>
+      </params>
+    </parser>
+  </parsers>
+</properties>
diff --git a/...-modules/tika-parser-text-module/src/test/resources/test-documents/testColonDelimited.txt b/...-modules/tika-parser-text-module/src/test/resources/test-documents/testColonDelimited.txt
@@ -0,0 +1,13 @@
+a:b:c:d
+1:2:3:4
+5:6:7:8
+5:6:7:8
+5:6:7:8
+5:6:7:8
+5:6:7:8
+5:6:7:8
+5:6:7:8
+5:6:7:8
+5:6:7:8
+5:6:7:8
+5:6:7:8
-Original file line number
+Diff line change
@@ -0,0 +1,13 @@
+a:b:c:d
+1:2:3:4
+5:6:7:8
+5:6:7:8
+5:6:7:8
+5:6:7:8
+5:6:7:8
+5:6:7:8
+5:6:7:8
+5:6:7:8
+5:6:7:8
+5:6:7:8
+5:6:7:8