Closes #614: Add event table (#615)
* Refs #614: adds cle keyspace and events table

* Refs #614: fixes index out of bounds error by using slice instead of substring
janehmueller authored Nov 3, 2017
1 parent bbf2fcf commit 6700557
Showing 2 changed files with 18 additions and 4 deletions.
19 changes: 17 additions & 2 deletions cassandra.cql
@@ -489,7 +489,6 @@ CREATE TABLE datalake.versiondiff (




-- Indices
CREATE CUSTOM INDEX ON datalake.subject (name) USING 'org.apache.cassandra.index.sasi.SASIIndex'
WITH OPTIONS = {
@@ -578,7 +577,7 @@ CREATE TABLE evaluation.featureentries (
staging frozen<subject>,
score map<text, frozen<list<double>>>,
correct boolean
); -- dropable
);

CREATE TABLE evaluation.goldstandard (
id1 uuid PRIMARY KEY,
@@ -671,3 +670,19 @@ CREATE TABLE evaluation.version (
program text,
timestamp timestamp
);



-- CLE
CREATE KEYSPACE cle WITH REPLICATION = {
'class' : 'SimpleStrategy',
'replication_factor' : 3
};

CREATE TABLE cle.events (
type text,
user text,
timestamp timestamp,
props map<text, text>,
PRIMARY KEY ((type, user), timestamp)
) WITH CLUSTERING ORDER BY (timestamp DESC);
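
The new events table partitions on (type, user) and clusters by timestamp in descending order, so each (type, user) pair owns one partition whose newest events sit at its head. A minimal usage sketch against this schema (the literal values are illustrative, not taken from the commit):

INSERT INTO cle.events (type, user, timestamp, props)
VALUES ('click', 'jane', toTimestamp(now()), {'page': 'search'});

-- Fetch the ten most recent 'click' events for this user. The full
-- partition key is required in the WHERE clause, and the DESC
-- clustering order makes "latest first" a sequential read from the
-- head of the partition.
SELECT * FROM cle.events
WHERE type = 'click' AND user = 'jane'
LIMIT 10;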
3 changes: 1 addition & 2 deletions RelationSentenceParser.scala
@@ -21,7 +21,6 @@ import de.hpi.ingestion.dataimport.dbpedia.models.Relation
import de.hpi.ingestion.dataimport.wikidata.models.WikidataEntity
import de.hpi.ingestion.framework.SparkJob
import de.hpi.ingestion.textmining.models.{EntityLink, ParsedWikipediaEntry, Sentence}
import de.hpi.ingestion.textmining.tokenizer.IngestionTokenizer
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import de.hpi.ingestion.textmining.preprocessing.CompanyLinkFilter.extractCompanyPages
@@ -107,7 +106,7 @@ object RelationSentenceParser {
.sortBy(_.offset)
.foreach { entity =>
entity.offset.foreach { entityOffset =>
bagOfWords ++= tokenizer.process(sentence.substring(sentenceOffset, entityOffset))
bagOfWords ++= tokenizer.process(sentence.slice(sentenceOffset, entityOffset))
sentenceOffset = entityOffset + entity.alias.length
}
}
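
The substring-to-slice change works because the two methods handle out-of-range indices differently: java.lang.String.substring throws StringIndexOutOfBoundsException when an index lies outside the string or the begin index exceeds the end index, while Scala's StringOps.slice clamps both indices to the string's bounds and returns an empty string when from >= until. Presumably a stored entity offset could point past the end of the sentence text, triggering exactly that exception. A minimal sketch of the difference (the example string is illustrative, not from the repository):

object SliceVsSubstring extends App {
  val sentence = "Acme buys Foo"   // length 13

  // substring throws on indices outside the string:
  // sentence.substring(10, 20)    // StringIndexOutOfBoundsException
  // sentence.substring(12, 4)     // StringIndexOutOfBoundsException

  // slice clamps the indices and degrades to "" instead of throwing:
  println(sentence.slice(10, 20))  // prints "Foo"
  println(sentence.slice(12, 4))   // prints "" (from >= until)
}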
