From 6700557adcfc4646654a2acf3db86ddb6f38b56e Mon Sep 17 00:00:00 2001 From: Jan Ehmueller Date: Fri, 3 Nov 2017 15:13:15 +0100 Subject: [PATCH] Closes #614: Add event table (#615) * Refs #614: adds cle keyspace and events table * Refs #614: fixes index out of bounds error by using slice instead of substring --- cassandra.cql | 19 +++++++++++++++++-- .../re/RelationSentenceParser.scala | 3 +-- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/cassandra.cql b/cassandra.cql index 4412bd5a..121fb40a 100644 --- a/cassandra.cql +++ b/cassandra.cql @@ -489,7 +489,6 @@ CREATE TABLE datalake.versiondiff ( - -- Indices CREATE CUSTOM INDEX ON datalake.subject (name) USING 'org.apache.cassandra.index.sasi.SASIIndex' WITH OPTIONS = { @@ -578,7 +577,7 @@ CREATE TABLE evaluation.featureentries ( staging frozen, score map>>, correct boolean -); -- dropable +); CREATE TABLE evaluation.goldstandard ( id1 uuid PRIMARY KEY, @@ -671,3 +670,19 @@ CREATE TABLE evaluation.version ( program text, timestamp timestamp ); + + + +-- CLE +CREATE KEYSPACE cle WITH REPLICATION = { + 'class' : 'SimpleStrategy', + 'replication_factor' : 3 +}; + +CREATE TABLE cle.events ( + type text, + user text, + timestamp timestamp, + props map, + PRIMARY KEY ((type, user), timestamp) +) WITH CLUSTERING ORDER BY (timestamp DESC); diff --git a/src/main/scala/de/hpi/ingestion/textmining/re/RelationSentenceParser.scala b/src/main/scala/de/hpi/ingestion/textmining/re/RelationSentenceParser.scala index 319e4462..aa68f63c 100644 --- a/src/main/scala/de/hpi/ingestion/textmining/re/RelationSentenceParser.scala +++ b/src/main/scala/de/hpi/ingestion/textmining/re/RelationSentenceParser.scala @@ -21,7 +21,6 @@ import de.hpi.ingestion.dataimport.dbpedia.models.Relation import de.hpi.ingestion.dataimport.wikidata.models.WikidataEntity import de.hpi.ingestion.framework.SparkJob import de.hpi.ingestion.textmining.models.{EntityLink, ParsedWikipediaEntry, Sentence} -import de.hpi.ingestion.textmining.tokenizer.IngestionTokenizer import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import de.hpi.ingestion.textmining.preprocessing.CompanyLinkFilter.extractCompanyPages @@ -107,7 +106,7 @@ object RelationSentenceParser { .sortBy(_.offset) .foreach { entity => entity.offset.foreach { entityOffset => - bagOfWords ++= tokenizer.process(sentence.substring(sentenceOffset, entityOffset)) + bagOfWords ++= tokenizer.process(sentence.slice(sentenceOffset, entityOffset)) sentenceOffset = entityOffset + entity.alias.length } }