From cf9d5ee5fd50440123dd84f98bf71d03ed6dcccf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABl=20de=20Chalendar?= Date: Fri, 18 Sep 2015 16:44:19 +0200 Subject: [PATCH 01/82] Increase version number to 2.1.0 --- gbuild.sh | 2 +- lima_common/LIMACOMMONConfig-src.cmake | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/gbuild.sh b/gbuild.sh index f2c875dae..6a2a72591 100755 --- a/gbuild.sh +++ b/gbuild.sh @@ -93,7 +93,7 @@ source_dir=$PWD if [[ $version = "rev" ]]; then release="$current_timestamp-$current_revision" else -release="2" +release="0" fi if [[ $parallel = "true" ]]; then diff --git a/lima_common/LIMACOMMONConfig-src.cmake b/lima_common/LIMACOMMONConfig-src.cmake index 94e79dae5..35e084004 100644 --- a/lima_common/LIMACOMMONConfig-src.cmake +++ b/lima_common/LIMACOMMONConfig-src.cmake @@ -1,7 +1,7 @@ # - Find LimaCommon set(LIMA_VERSION_MAJOR "2") -set(LIMA_VERSION_MINOR "0") +set(LIMA_VERSION_MINOR "1") set(LIMA_VERSION_RELEASE @LIMA_VERSION_RELEASE@) set(LIMA_VERSION "${LIMA_VERSION_MAJOR}.${LIMA_VERSION_MINOR}.${LIMA_VERSION_RELEASE}") From ec292da63b9d0fd1e6c32f00ced95641eb638c26 Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Thu, 1 Oct 2015 10:29:24 +0200 Subject: [PATCH 02/82] Add guards to avoid storing tokens with null micro --- .../applyRecognizerActions.cpp | 22 +++++++++++++++++ .../AbbreviationSplitAlternatives.cpp | 9 ++++++- .../IdiomaticAlternativesConstraints.cpp | 22 +++++++++++++++-- .../SpecificEntitiesConstraints.cpp | 24 +++++++++++++++---- 4 files changed, 70 insertions(+), 7 deletions(-) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/ApplyRecognizer/applyRecognizerActions.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/ApplyRecognizer/applyRecognizerActions.cpp index 76a93e57b..2d47416b1 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/ApplyRecognizer/applyRecognizerActions.cpp +++ 
b/lima_linguisticprocessing/src/linguisticProcessing/core/ApplyRecognizer/applyRecognizerActions.cpp @@ -315,6 +315,13 @@ operator()(RecognizerMatch& result, // create the new token pair newToken= createAlternativeToken(result); + if (newToken.second->empty()) { + APPRLOGINIT; + LERROR << "CreateAlternative::operator(): Got empty morphosyntactic data. Abort."; + delete newToken.first; + delete newToken.second; + return false; + } // LDEBUG << "create alternative token " << newToken.first->stringForm(); // add the vertex @@ -351,6 +358,13 @@ operator()(RecognizerMatch& result, // create the new token pair newToken= createAlternativeToken(result); + if (newToken.second->empty()) { + APPRLOGINIT; + LERROR << "CreateAlternative::operator(): Got empty morphosyntactic data. Abort."; + delete newToken.first; + delete newToken.second; + return false; + } // add the vertex LinguisticGraphVertex altVertex = @@ -374,6 +388,14 @@ operator()(RecognizerMatch& result, // LDEBUG << "duplication vertex " << matchItr->getVertex();; Token* token=get(vertex_token,*graph,matchItr->getVertex()); MorphoSyntacticData* data=new MorphoSyntacticData(*get(vertex_data,*graph,matchItr->getVertex())); + if (data->empty()) + { + // ignore current idiomatic expression, continue + APPRLOGINIT; + LERROR << "CreateAlternative::operator() Got empty morphosyntactic data. 
Abort"; + delete data; + return false; + } LinguisticGraphVertex dupVx=add_vertex(*graph); put(vertex_token,*graph,dupVx,token); put(vertex_data,*graph,dupVx,data); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/AbbreviationSplitAlternatives.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/AbbreviationSplitAlternatives.cpp index 524d1ba7d..5fd837217 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/AbbreviationSplitAlternatives.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/AbbreviationSplitAlternatives.cpp @@ -357,7 +357,14 @@ bool AbbreviationSplitAlternatives::makeConcatenatedAbbreviationSplitAlternative { LERROR << "AbbreviationSplitAlternatives::makeConcatenatedAbbreviationSplitAlternativeFor: Cannot find a dictionary entry for abbreviated word " << Lima::Common::Misc::limastring2utf8stdstring(abbrev); } - + if (newData->empty()) + { + MORPHOLOGINIT; + LERROR << "AbbreviationSplitAlternatives::makeConcatenatedAbbreviationSplitAlternativeFor Got empty morphosyntactic data. 
Abort."; + delete newFT; + delete newData; + return false; + } // LinguisticGraphVertex afterVertex = listIterator.createVertexFor(newFT); LinguisticGraphVertex afterVertex = add_vertex(*graph); put(vertex_token,*graph,afterVertex,newFT); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/IdiomaticAlternativesConstraints.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/IdiomaticAlternativesConstraints.cpp index 09238c128..6d6a834cf 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/IdiomaticAlternativesConstraints.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/IdiomaticAlternativesConstraints.cpp @@ -180,13 +180,22 @@ bool CreateIdiomaticAlternative::operator()(Automaton::RecognizerMatch& result, { // ignore current idiomatic expression, continue MORPHOLOGINIT; - LWARN << "idiomatic expression ignored: " << Common::Misc::limastring2utf8stdstring(result.concatString()) + LWARN << "idiomatic expression ignored: " << Common::Misc::limastring2utf8stdstring(result.concatString()) << ": overlapping with a previous one"; return false; } // create the new token std::pair newToken = createAlternativeToken(result); + if (newToken.second->empty()) + { + // ignore current idiomatic expression, continue + MORPHOLOGINIT; + LERROR << "CreateIdiomaticAlternative::operator() Got empty morphosyntactic data. 
Abort"; + delete newToken.first; + delete newToken.second; + return false; + } // add the vertex LinguisticGraphVertex idiomaticVertex = @@ -241,13 +250,22 @@ bool CreateIdiomaticAlternative::operator()(Automaton::RecognizerMatch& result, { // ignore current idiomatic expression, continue MORPHOLOGINIT; - LWARN << "idiomatic expression ignored: " << Common::Misc::limastring2utf8stdstring(result.concatString()) + LWARN << "idiomatic expression ignored: " << Common::Misc::limastring2utf8stdstring(result.concatString()) << ": overlapping with a previous one"; return false; } // create the new token pair newToken = createAlternativeToken(result); + if (newToken.second->empty()) + { + // ignore current idiomatic expression, continue + MORPHOLOGINIT; + LERROR << "CreateIdiomaticAlternative::operator() Got empty morphosyntactic data. Abort"; + delete newToken.first; + delete newToken.second; + return false; + } // add the vertex LinguisticGraphVertex idiomaticVertex = diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesConstraints.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesConstraints.cpp index f60586b98..d3046e4f5 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesConstraints.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesConstraints.cpp @@ -464,6 +464,9 @@ bool CreateSpecificEntity::operator()(Automaton::RecognizerMatch& match, elem.type = SPECIFIC_ENTITY; // MorphoSyntacticType if (! 
m_microsToKeep.empty()) { +#ifdef DEBUG_LP + LDEBUG << "CreateSpecificEntity, use micros from the rule "; +#endif // micros are given in the rules addMicrosToMorphoSyntacticData(newMorphData,dataHead,m_microsToKeep,elem); } @@ -479,11 +482,12 @@ bool CreateSpecificEntity::operator()(Automaton::RecognizerMatch& match, { SELOGINIT; LERROR << "CreateSpecificEntity::operator() null group id:" << seType; + delete newMorphData; return false; } - std::string resourceName= - Common::Misc::limastring2utf8stdstring(Common::MediaticData::MediaticData::single().getEntityGroupName(seType.getGroupId()))+"Micros"; - AbstractResource* res=LinguisticResources::single().getResource(m_language,resourceName); + const LimaString& resourceName = + Common::MediaticData::MediaticData::single().getEntityGroupName(seType.getGroupId())+"Micros"; + AbstractResource* res=LinguisticResources::single().getResource(m_language,resourceName.toUtf8().constData()); #ifdef DEBUG_LP LDEBUG << "Entities resource name is : " << resourceName; #endif @@ -506,6 +510,8 @@ bool CreateSpecificEntity::operator()(Automaton::RecognizerMatch& match, // cannot find micros for this type: error SELOGINIT; LERROR << "CreateSpecificEntity: missing resource " << resourceName ; + delete newMorphData; + return false; } } @@ -522,6 +528,15 @@ bool CreateSpecificEntity::operator()(Automaton::RecognizerMatch& match, newToken->setStatus(tokenMap[v1]->status()); //} + if (newMorphData->empty()) + { + SELOGINIT; + LERROR << "CreateSpecificEntity::operator() Found no morphosyntactic data for new vertex. 
Abort."; + delete newToken; + delete newMorphData; + assert(false); + return false; + } // LDEBUG << " Updating morphologic graph "<< graphId; // creer le noeud et ses 2 arcs LinguisticGraphVertex newVertex; @@ -985,8 +1000,9 @@ operator()(const LinguisticAnalysisStructure::AnalysisGraph& graph, } } } - if( nbEdges > 1 ) + if( nbEdges > 1 ) { LWARN << "SetEntityFeature:: Warning: ambiguïties in graph"; + } Token* token=get(vertex_token,lGraph,v); if (v == v1) { From d133fde7728625aaf1483fda9cd9d27468caf0cc Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Thu, 1 Oct 2015 10:34:34 +0200 Subject: [PATCH 03/82] Better guard for a good vertex morph data def --- .../core/SyntacticAnalysis/ChainsDisambiguator.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/ChainsDisambiguator.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/ChainsDisambiguator.cpp index 63d4fc55f..0176e6b3a 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/ChainsDisambiguator.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/ChainsDisambiguator.cpp @@ -180,7 +180,7 @@ void ChainsDisambiguator::computePaths() const std::set< ChainIdStruct >& nextVertexChains = chainsMap[nextVertex]; LinguisticCode nextMicroCateg(0); const MorphoSyntacticData* nextData = dataMap[nextVertex]; - if (nextData == 0) + if (nextData == 0 || nextData->empty()) { SADLOGINIT; LWARN << "vertex " << nextVertex << " has no data"; From 9b9af97c78441704d796f5a559055e68e30ea635 Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Thu, 1 Oct 2015 10:36:09 +0200 Subject: [PATCH 04/82] Ensure no morph data with null micro is used Conflicts: lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/HomoSyntagmaticConstraints.cpp --- .../HomoSyntagmaticConstraints.cpp | 23 +++++++++++++++++-- 1 file changed, 21 insertions(+), 
2 deletions(-) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/HomoSyntagmaticConstraints.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/HomoSyntagmaticConstraints.cpp index d3659a79a..c645fc5da 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/HomoSyntagmaticConstraints.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/HomoSyntagmaticConstraints.cpp @@ -719,6 +719,10 @@ CreateCompoundTense::CreateCompoundTense(MediaId language, m_micro(0), m_tempCompType(0) { +#ifdef DEBUG_LP + SAPLOGINIT; + LDEBUG << "CreateCompoundTense::CreateCompoundTense()" << language << complement; +#endif const std::string str= Common::Misc::limastring2utf8stdstring(complement); @@ -728,7 +732,10 @@ CreateCompoundTense::CreateCompoundTense(MediaId language, size_t secondSepPos = str.find_first_of(';', firstSepPos+1); m_micro=static_cast(Common::MediaticData::MediaticData::single().mediaData(language)).getPropertyCodeManager().getPropertyManager("MICRO").getPropertyValue(str.substr(firstSepPos + 1, secondSepPos - firstSepPos - 1)); - m_tempCompType=static_cast(Common::MediaticData::MediaticData::single().mediaData(language)).getSyntacticRelationId("TEMPCOMP"); + m_tempCompType=static_cast(Common::MediaticData::MediaticData::single().mediaData(language)).getSyntacticRelationId("aux"); +#ifdef DEBUG_LP + LDEBUG << "CreateCompoundTense::CreateCompoundTense() m_tempCompType" << m_tempCompType; +#endif m_macroAccessor=&(static_cast(Common::MediaticData::MediaticData::single().mediaData(language)).getPropertyCodeManager().getPropertyAccessor("MACRO")); m_microAccessor=&(static_cast(Common::MediaticData::MediaticData::single().mediaData(language)).getPropertyCodeManager().getPropertyAccessor("MICRO")); @@ -773,7 +780,19 @@ bool CreateCompoundTense::operator()(const AnalysisGraph& anagraph, Token* tokenAux = tokenMap[auxVertex]; Token* tokenPastPart = 
tokenMap[pastPartVertex]; const MorphoSyntacticData* dataAux = dataMap[auxVertex]; + if (dataAux->empty()) + { + SAPLOGINIT; + LERROR << "CreateCompoundTense::operator() morphosyntactic data is empty for aux. Abort."; + return false; + } const MorphoSyntacticData* dataPastPart = dataMap[pastPartVertex]; + if (dataPastPart->empty()) + { + SAPLOGINIT; + LERROR << "CreateCompoundTense::operator() morphosyntactic data is empty for past participle. Abort."; + return false; + } LinguisticCode dataAuxMicro = dataAux->firstValue(*m_microAccessor); LinguisticCode tense = static_cast(Common::MediaticData::MediaticData::single().mediaData(language)).compoundTense(dataAuxMicro, dataAux->firstValue(*m_timeAccessor)); @@ -1125,7 +1144,7 @@ bool CreateCompoundTense::operator()(const AnalysisGraph& anagraph, m_tempCompType); #ifdef DEBUG_LP - LDEBUG << "CreateCompoundTense: " << res; + LDEBUG << "CreateCompoundTense: " << m_tempCompType << res; #endif RecognizerData* recoData=static_cast(analysis.getData("RecognizerData")); if (recoData == 0) From 288be069286b3582015f946380d7e748b88270ec Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Thu, 1 Oct 2015 10:37:29 +0200 Subject: [PATCH 05/82] Corrects "Empty data for vertex" bug When a wrong micro-category was used in the definition of a Modex, a wrong null value was set to the newly created token. This commit writes an error message and avoids adding the wrong value. If there is no other micro associated to this entity, an exception is thrown later on when trying to use the empty list. 
--- .../core/SpecificEntities/SpecificEntitiesMicros.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesMicros.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesMicros.cpp index 1f5c31588..6adb06b10 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesMicros.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesMicros.cpp @@ -77,10 +77,17 @@ init(GroupConfigurationStructure& unitConfiguration, EntityType type=static_cast(MediaticData::single()).getEntityType(entityName); for (deque::const_iterator micro=(*it).second.begin(), micro_end=(*it).second.end(); micro!=micro_end; micro++) { + LinguisticCode code = microManager.getPropertyValue(*micro); + if (code == 0) { + SELOGINIT; + LERROR << "SpecificEntitiesMicros::init on entity" << entityName << "," << *micro << "linguistic code is not defined"; + } + else { #ifdef DEBUG_LP - LDEBUG << "Adding " << *micro << microManager.getPropertyValue(*micro) << " to EntityType " << type; + LDEBUG << "Adding " << *micro << code << " to EntityType " << type; #endif - m_micros[type].insert(microManager.getPropertyValue(*micro)); + m_micros[type].insert(code); + } } } catch (LimaException& e) { From d6b1d0aa83bf549cdc89363abe7211f98b59c567 Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Wed, 7 Oct 2015 23:14:54 +0200 Subject: [PATCH 06/82] Add missing constructors The initializable object constructors and operator= were not implemented, using the default ones, thus the m_id member was not copied. 
--- .../src/common/AbstractFactoryPattern/InitializableObject.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lima_common/src/common/AbstractFactoryPattern/InitializableObject.h b/lima_common/src/common/AbstractFactoryPattern/InitializableObject.h index dade5ab30..8f3060765 100644 --- a/lima_common/src/common/AbstractFactoryPattern/InitializableObject.h +++ b/lima_common/src/common/AbstractFactoryPattern/InitializableObject.h @@ -43,6 +43,10 @@ template class InitializableObject { public: + InitializableObject() : m_id() {} + InitializableObject(const InitializableObject& object) { m_id = object.m_id; } + InitializableObject& operator=(const InitializableObject& object) { m_id = object.m_id; return *this; } + /** * Manager is the type of the Manager associated to the initializableObject. * This type is an instanciation of InitializableObjectManager template with From 59a0a7d206a46fb34e5098ee5eb9c54a906ba993 Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Wed, 7 Oct 2015 23:52:57 +0200 Subject: [PATCH 07/82] Add missing parent class constructor call --- .../core/LinguisticResources/AbstractResource.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticResources/AbstractResource.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticResources/AbstractResource.cpp index 720040ab1..378f40d5f 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticResources/AbstractResource.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticResources/AbstractResource.cpp @@ -37,7 +37,10 @@ friend class AbstractResource; LimaFileSystemWatcher m_resourceFileWatcher; }; -AbstractResource::AbstractResource( QObject* parent ) : QObject( parent ), m_d(new AbstractResourcePrivate()) +AbstractResource::AbstractResource( QObject* parent ) : + QObject( parent ), + InitializableObject(), + m_d(new AbstractResourcePrivate()) { 
connect(&m_d->m_resourceFileWatcher,SIGNAL(fileChanged(QString)),this,SIGNAL(resourceFileChanged(QString))); } @@ -47,7 +50,10 @@ AbstractResource::~AbstractResource() delete m_d; } -AbstractResource::AbstractResource(const AbstractResource& r) : QObject(r.parent()), m_d(new AbstractResourcePrivate(*r.m_d)) +AbstractResource::AbstractResource(const AbstractResource& r) : + QObject(r.parent()), + InitializableObject(), + m_d(new AbstractResourcePrivate(*r.m_d)) { connect(&m_d->m_resourceFileWatcher,SIGNAL(fileChanged(QString)),this,SIGNAL(resourceFileChanged(QString))); } From 0fc49f8d151aba7e8545694763ad3bd878ab2bbc Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Fri, 13 Nov 2015 13:12:09 +0100 Subject: [PATCH 08/82] Add hunspell spellcheck and correct enchant one --- .../cmake/Modules/FindHUNSPELL.cmake | 34 +++ .../HunspellSpellingAlternatives.cpp | 208 ++++++++++++++++++ .../HunspellSpellingAlternatives.h | 72 ++++++ 3 files changed, 314 insertions(+) create mode 100644 lima_linguisticprocessing/cmake/Modules/FindHUNSPELL.cmake create mode 100644 lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/HunspellSpellingAlternatives.cpp create mode 100644 lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/HunspellSpellingAlternatives.h diff --git a/lima_linguisticprocessing/cmake/Modules/FindHUNSPELL.cmake b/lima_linguisticprocessing/cmake/Modules/FindHUNSPELL.cmake new file mode 100644 index 000000000..873f72516 --- /dev/null +++ b/lima_linguisticprocessing/cmake/Modules/FindHUNSPELL.cmake @@ -0,0 +1,34 @@ +# - Try to find HUNSPELL +# Once done this will define +# +# HUNSPELL_FOUND - system has HUNSPELL +# HUNSPELL_INCLUDE_DIR - the HUNSPELL include directory +# HUNSPELL_LIBRARIES - The libraries needed to use HUNSPELL +# HUNSPELL_DEFINITIONS - Compiler switches required for using HUNSPELL + + +IF (HUNSPELL_INCLUDE_DIR AND HUNSPELL_LIBRARIES) + # Already in cache, be silent + SET(HUNSPELL_FIND_QUIETLY TRUE) 
+ENDIF (HUNSPELL_INCLUDE_DIR AND HUNSPELL_LIBRARIES) + +FIND_PATH(HUNSPELL_INCLUDE_DIR + NAMES hunspell/hunspell.hxx + PATHS + "[HKEY_LOCAL_MACHINE\\SOFTWARE\\GnuWin32\\hunspell;InstallPath]/include" + "C:/Program Files/GnuWin32/include" + ) + +FIND_LIBRARY(HUNSPELL_LIBRARIES + NAMES hunspell-1.3 hunspell-1.2 hunspell + PATHS + "[HKEY_LOCAL_MACHINE\\SOFTWARE\\GnuWin32\\hunspell;InstallPath]/lib" + ) + +# handle the QUIETLY and REQUIRED arguments and set HUNSPELL_FOUND to TRUE if +# all listed variables are TRUE +INCLUDE(FindPackageHandleStandardArgs) +FIND_PACKAGE_HANDLE_STANDARD_ARGS(HUNSPELL DEFAULT_MSG HUNSPELL_LIBRARIES HUNSPELL_INCLUDE_DIR) + + +MARK_AS_ADVANCED(HUNSPELL_INCLUDE_DIR HUNSPELL_LIBRARIES) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/HunspellSpellingAlternatives.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/HunspellSpellingAlternatives.cpp new file mode 100644 index 000000000..ceceb312e --- /dev/null +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/HunspellSpellingAlternatives.cpp @@ -0,0 +1,208 @@ +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. 
If not, see +*/ + +#include "HunspellSpellingAlternatives.h" +#include "MorphoSyntacticDataHandler.h" + +#include "common/time/traceUtils.h" +#include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" +#include "common/AbstractFactoryPattern/SimpleFactory.h" +#include "common/MediaticData/mediaticData.h" +#include "linguisticProcessing/core/FlatTokenizer/CharChart.h" +#include "linguisticProcessing/core/LinguisticResources/LinguisticResources.h" +#include "linguisticProcessing/core/LinguisticAnalysisStructure/AnalysisGraph.h" +#include "linguisticProcessing/core/LinguisticAnalysisStructure/MorphoSyntacticData.h" +#include "linguisticProcessing/core/AnalysisDict/AbstractAnalysisDictionary.h" + +#include + +using namespace Lima::LinguisticProcessing::LinguisticAnalysisStructure; +using namespace Lima::LinguisticProcessing::AnalysisDict; +using namespace Lima::LinguisticProcessing::FlatTokenizer; +using namespace Lima::Common::XMLConfigurationFiles; + +namespace Lima +{ +namespace LinguisticProcessing +{ +namespace MorphologicAnalysis +{ + + SimpleFactory hunspellSpellingAlternativesFactory(HUNSPELL_SPELLING_ALTERNATIVES_CLASSID); + + +class HunspellSpellingAlternativesPrivate +{ + friend class HunspellSpellingAlternatives; + +public: + HunspellSpellingAlternativesPrivate() : m_hunspell(0) {} + virtual ~HunspellSpellingAlternativesPrivate() {delete m_hunspell;} + + + void setHunspellSpellingAlternatives( + LinguisticAnalysisStructure::Token* token, + LinguisticAnalysisStructure::MorphoSyntacticData* tokenData, + FsaStringsPool& sp); + + AnalysisDict::AbstractAnalysisDictionary* m_dictionary; + MediaId m_language; + Hunspell* m_hunspell; +}; + + +HunspellSpellingAlternatives::HunspellSpellingAlternatives() : m_d(new HunspellSpellingAlternativesPrivate()) +{ + +} + +HunspellSpellingAlternatives::~HunspellSpellingAlternatives() +{ + delete m_d; +} + +void HunspellSpellingAlternatives::init( + Common::XMLConfigurationFiles::GroupConfigurationStructure& 
unitConfiguration, + Manager* manager) +{ + MORPHOLOGINIT; + LDEBUG << "HunspellSpellingAlternatives::init"; std::string spellDico; + m_d->m_language = manager->getInitializationParameters().media; + + try + { + // try to get a specific spellchecking dictionary name from the config file + spellDico = unitConfiguration.getParamsValueAtKey("spellcheckDictionary"); + } + catch (NoSuchParam& ) + { + LERROR << "no param 'spellcheckDictionary' in HunspellSpellingAlternatives group for language " << (int) m_d->m_language; + throw InvalidConfiguration(); + } + LDEBUG << "HunspellSpellingAlternatives::init requesting Hunspell spellcheck dictionary" << Common::MediaticData::MediaticData::changeable().getResourcesPath()+"/Spellchecking/" << spellDico; + if (m_d->m_hunspell != 0) delete m_d->m_hunspell; + m_d->m_hunspell = new Hunspell( (Common::MediaticData::MediaticData::changeable().getResourcesPath()+"/Spellchecking/"+spellDico+".aff").c_str(), + (Common::MediaticData::MediaticData::changeable().getResourcesPath()+"/Spellchecking/"+spellDico+".dic").c_str() ); + try + { + std::string dico=unitConfiguration.getParamsValueAtKey("dictionary"); + AbstractResource* res= LinguisticResources::single().getResource(m_d->m_language,dico); + m_d->m_dictionary=static_cast(res); + } + catch (NoSuchParam& ) + { + LERROR << "no param 'dictionary' in HunspellSpellingAlternatives group for language " << (int) m_d->m_language; + throw InvalidConfiguration(); + } +} + + +LimaStatusCode HunspellSpellingAlternatives::process(AnalysisContent& analysis) const +{ + TimeUtils::updateCurrentTime(); + MORPHOLOGINIT; + LINFO << "MorphologicalAnalysis: starting process HunspellSpellingAlternatives"; + + FsaStringsPool& sp=Common::MediaticData::MediaticData::changeable().stringsPool(m_d->m_language); + AnalysisGraph* tokenList=static_cast(analysis.getData("AnalysisGraph")); + LinguisticGraph* g=tokenList->getGraph(); + VertexDataPropertyMap dataMap=get(vertex_data,*g); + VertexTokenPropertyMap 
tokenMap=get(vertex_token,*g); + LinguisticGraphVertexIt it,itEnd; + for (boost::tie(it,itEnd)=vertices(*g) ; it != itEnd ; it++) + { + LDEBUG << "HunspellSpellingAlternatives::process processing vertex " << *it; + Token* currentToken=tokenMap[*it]; + MorphoSyntacticData* msd=dataMap[*it]; + + if (currentToken!=0) + { + if (msd->empty()) + { + m_d->setHunspellSpellingAlternatives( + currentToken, + msd, + sp); + } + } + } + LINFO << "MorphologicalAnalysis: ending process HunspellSpellingAlternatives"; + return SUCCESS_ID; +} + +void HunspellSpellingAlternativesPrivate::setHunspellSpellingAlternatives( + Token* token, + MorphoSyntacticData* tokenData, + FsaStringsPool& sp) +{ + // try to find simple Uncapitalization + MORPHOLOGINIT; + // FIXME Conditions below could be process unit parameters + const LimaString& tokenStr=token->stringForm(); + if (token->status().getAlphaCapital() == T_CAPITAL + || token->status().getAlphaCapital() == T_CAPITAL_1ST + || token->status().getAlphaCapital() == T_CAPITAL_SMALL + || token->status().isAlphaConcatAbbrev() + || token->status().isAlphaHyphen() + || token->status().isAlphaPossessive() + || tokenStr.toUpper() == tokenStr) + { + return; + } + char **suggestions; + int suggestResult = m_hunspell->suggest(&suggestions, tokenStr.toUtf8().constData()); + for (int i = 0; i < suggestResult; i++) + { + LimaString correction = LimaString::fromUtf8(suggestions[i]); + // FIXME Conditions below could be process unit parameters + if ( correction.size() > 1 && correction != tokenStr ) + { + LDEBUG << "HunspellSpellingAlternativesPrivate::setHunspellSpellingAlternatives trying to correct" << tokenStr << "into" << correction; + DictionaryEntry entry (m_dictionary->getEntry(correction)); + MorphoSyntacticDataHandler lingInfosHandler(*tokenData, SPELLING_ALTERNATIVE); + + +// if (!entry.isEmpty()) + { + LINFO << "HunspellSpellingAlternativesPrivate::setHunspellSpellingAlternatives correcting" << tokenStr << "into" << correction; + // add 
orthographic alternative to Token; + StringsPoolIndex idx=sp[correction]; + token->addOrthographicAlternatives(idx); + + if (entry.hasLingInfos()) + { + entry.parseLingInfos(&lingInfosHandler); + } + } +// else +// { +// LDEBUG << "HunspellSpellingAlternativesPrivate::setHunspellSpellingAlternatives correction" << correction << "not found in the dictionary"; +// delete entry; +// } + } + } + m_hunspell->free_list(&suggestions, suggestResult); +} + +} // MorphologicAnalysis +} // LinguisticProcessing +} // Lima + + + diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/HunspellSpellingAlternatives.h b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/HunspellSpellingAlternatives.h new file mode 100644 index 000000000..b5f085634 --- /dev/null +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/HunspellSpellingAlternatives.h @@ -0,0 +1,72 @@ +/* + Copyright 2015 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. 
If not, see +*/ + +#ifndef HUNSPELL_SPELLING_ALTERNATIVES_H +#define HUNSPELL_SPELLING_ALTERNATIVES_H + +#include "common/Data/LimaString.h" +#include "common/MediaProcessors/MediaProcessUnit.h" +#include "linguisticProcessing/core/LinguisticAnalysisStructure/Token.h" +#include "linguisticProcessing/core/LinguisticAnalysisStructure/MorphoSyntacticData.h" + +namespace Lima +{ +namespace LinguisticProcessing +{ +namespace FlatTokenizer +{ + class CharChart; +} +namespace LinguisticAnalysisStructure +{ + class MorphoSyntacticData; +} +namespace AnalysisDict +{ + class AbstractAnalysisDictionary; +} +namespace MorphologicAnalysis +{ + +#define HUNSPELL_SPELLING_ALTERNATIVES_CLASSID "HunspellSpellingAlternatives" +class HunspellSpellingAlternativesPrivate; +class HunspellSpellingAlternatives : public MediaProcessUnit { + +public: + HunspellSpellingAlternatives(); + virtual ~HunspellSpellingAlternatives(); + + void init( + Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, + Manager* manager); + + LimaStatusCode process( + AnalysisContent& analysis) const; + +private: + HunspellSpellingAlternativesPrivate* m_d; + +}; + +} // MorphologicAnalysis +} // LinguisticProcessing +} // Lima + + +#endif // HUNSPELL_SPELLING_ALTERNATIVES_H From 3f5ba1d35350e7444e0cbcbdc108df73a27cdd9e Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Fri, 27 Nov 2015 16:01:44 +0100 Subject: [PATCH 09/82] Finish integration of hunspell --- lima_linguisticprocessing/CMakeLists.txt | 12 +++++- .../conf/lima-lp-fre.xml | 16 ++++++++ .../AlternativesReader.cpp | 2 +- .../core/MorphologicAnalysis/CMakeLists.txt | 7 ++++ .../MorphologicAnalysis/DefaultProperties.cpp | 12 ++++-- .../EnchantSpellingAlternatives.cpp | 37 ++++++++++++++----- 6 files changed, 70 insertions(+), 16 deletions(-) diff --git a/lima_linguisticprocessing/CMakeLists.txt b/lima_linguisticprocessing/CMakeLists.txt index 35902d8ec..4a7487972 100644 --- a/lima_linguisticprocessing/CMakeLists.txt +++ 
b/lima_linguisticprocessing/CMakeLists.txt @@ -139,12 +139,22 @@ endif() find_package (Enchant) if (ENCHANT_FOUND) + message(STATUS "Found Enchant. Enchant spelling correction will be built.") set(optionalLibs ${optionalLibs} ${ENCHANT_LIBRARIES}) include_directories(${ENCHANT_INCLUDE_DIRS}) else (ENCHANT_FOUND) - message(STATUS "Could not find Enchant ${ENCHANT_INCLUDE_DIRS}. Spelling correction will not be built.") + message(STATUS "Could not find Enchant ${ENCHANT_INCLUDE_DIRS}. Enchant spelling correction will not be built.") endif (ENCHANT_FOUND) +find_package (HUNSPELL) +if (HUNSPELL_FOUND) + message(STATUS "Found Hunspell. Hunspell spelling correction will be built.") + set(optionalLibs ${optionalLibs} ${HUNSPELL_LIBRARIES}) + include_directories(${HUNSPELL_INCLUDE_DIR}) +else (HUNSPELL_FOUND) + message(STATUS "Could not find Hunspell ${HUNSPELL_INCLUDE_DIRS}. Hunspell spelling correction will not be built.") +endif (HUNSPELL_FOUND) + # QHttpServer is necessary for limaserver HTTP server find_package(QHttpServer QUIET) if (NOT QHTTPSERVER_FOUND) diff --git a/lima_linguisticprocessing/conf/lima-lp-fre.xml b/lima_linguisticprocessing/conf/lima-lp-fre.xml index f037fccbb..3716fb6c4 100644 --- a/lima_linguisticprocessing/conf/lima-lp-fre.xml +++ b/lima_linguisticprocessing/conf/lima-lp-fre.xml @@ -18,6 +18,12 @@ + + + + @@ -318,6 +324,16 @@ + + + + + + + + + + diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/AlternativesReader.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/AlternativesReader.cpp index 287fb1a16..a2c7660da 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/AlternativesReader.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/AlternativesReader.cpp @@ -159,7 +159,6 @@ void AlternativesReader::readAlternatives( #ifdef DEBUG_LP LDEBUG << "-> StringPool returned index " << idx; #endif - 
token.addOrthographicAlternatives(idx); DictionaryEntry entry=dico.getEntry(idx,unmarked); #ifdef DEBUG_LP LDEBUG << "entry.isEmpty:" << entry.isEmpty(); @@ -167,6 +166,7 @@ void AlternativesReader::readAlternatives( if (!entry.isEmpty()) { + token.addOrthographicAlternatives(idx); #ifdef DEBUG_LP LDEBUG << "confident mode: " << m_confidentMode; LDEBUG << "lingInfosHandler: " << (void*)accentedHandler << " entry.hasLingInfos:" << entry.hasLingInfos(); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/CMakeLists.txt b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/CMakeLists.txt index ede2e7344..bc566d198 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/CMakeLists.txt +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/CMakeLists.txt @@ -43,6 +43,13 @@ SET(lima-lp-morphologicanalysis_LIB_SRCS ) endif (ENCHANT_FOUND) +if (HUNSPELL_FOUND) +SET(lima-lp-morphologicanalysis_LIB_SRCS + ${lima-lp-morphologicanalysis_LIB_SRCS} + HunspellSpellingAlternatives.cpp +) +endif () + DECLARE_LIMA_PLUGIN(lima-lp-morphologicanalysis) target_link_libraries(lima-lp-morphologicanalysis diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/DefaultProperties.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/DefaultProperties.cpp index 10571135b..caa3795c7 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/DefaultProperties.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/DefaultProperties.cpp @@ -147,11 +147,15 @@ LimaStatusCode DefaultProperties::process( if (it!=m_defaults.end()) { LinguisticElement elem; elem.inflectedForm=currentToken->form(); - LimaString str=currentToken->stringForm(); - if(m_skipUnmarkStatus.find(currentToken->status().defaultKey())==m_skipUnmarkStatus.end()){ - str = 
m_charChart->unmark(currentToken->stringForm()); + if (!currentToken->orthographicAlternatives().empty()) + { + elem.lemma = *(currentToken->orthographicAlternatives().begin()); + } + else if(m_skipUnmarkStatus.find(currentToken->status().defaultKey())==m_skipUnmarkStatus.end()) + { + LimaString str = m_charChart->toLower(currentToken->stringForm()); + elem.lemma= Common::MediaticData::MediaticData::changeable().stringsPool(m_language)[str]; } - elem.lemma= Common::MediaticData::MediaticData::changeable().stringsPool(m_language)[str]; elem.normalizedForm=elem.lemma; elem.type=UNKNOWN_WORD; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/EnchantSpellingAlternatives.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/EnchantSpellingAlternatives.cpp index 44f213bef..393bb8d1a 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/EnchantSpellingAlternatives.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/EnchantSpellingAlternatives.cpp @@ -82,14 +82,29 @@ void EnchantSpellingAlternatives::init( Manager* manager) { MORPHOLOGINIT; + LDEBUG << "EnchantSpellingAlternatives::init"; m_d->m_language = manager->getInitializationParameters().media; try { - m_d->m_enchantDictionary = enchant::Broker::instance()->request_dict(Common::MediaticData::MediaticData::changeable().getMediaId(m_d->m_language).substr(0,2)); + // By default, the spellchecking dictionary is the system one for the current language + std::string spellDico = Common::MediaticData::MediaticData::changeable().getMediaId(m_d->m_language).substr(0,2); + try + { + // try to get a specific spellchecking dictionary name from the config file + spellDico = unitConfiguration.getParamsValueAtKey("spellcheckDictionary"); + } + catch (NoSuchParam& ) + { + } +// LDEBUG << "EnchantSpellingAlternatives::init requesting Enchant spellcheck dictionary" << 
Common::MediaticData::MediaticData::changeable().getResourcesPath()+"/Spellchecking/" << spellDico; +// enchant::Broker::instance()->set_param("enchant.myspell.dictionary.path",Common::MediaticData::MediaticData::changeable().getResourcesPath()+"/Spellchecking/"); + LDEBUG << "EnchantSpellingAlternatives::init requesting Enchant spellcheck dictionary" << spellDico; + m_d->m_enchantDictionary = enchant::Broker::instance()->request_dict(spellDico); } catch (enchant::Exception& e) { - LERROR << "Cannot get Enchant dictionary for language" << Common::MediaticData::MediaticData::changeable().getMediaId(m_d->m_language); + MORPHOLOGINIT; + LERROR << "Cannot get Enchant dictionary for language" << Common::MediaticData::MediaticData::changeable().getMediaId(m_d->m_language)<< ":" << e.what(); throw LimaException(); } try @@ -165,26 +180,28 @@ void EnchantSpellingAlternativesPrivate::setEnchantSpellingAlternatives( // FIXME Conditions below could be process unit parameters if ( correction.size() > 1 && correction != tokenStr ) { - DictionaryEntry* entry = new DictionaryEntry(m_dictionary->getEntry(correction)); + LDEBUG << "EnchantSpellingAlternativesPrivate::setEnchantSpellingAlternatives trying to correct" << tokenStr << "into" << correction; + DictionaryEntry entry (m_dictionary->getEntry(correction)); MorphoSyntacticDataHandler lingInfosHandler(*tokenData, SPELLING_ALTERNATIVE); - if (!entry->isEmpty()) +// if (!entry.isEmpty()) { LINFO << "EnchantSpellingAlternativesPrivate::setEnchantSpellingAlternatives correcting" << tokenStr << "into" << correction; // add orthographic alternative to Token; StringsPoolIndex idx=sp[correction]; token->addOrthographicAlternatives(idx); - if (entry->hasLingInfos()) + if (entry.hasLingInfos()) { - entry->parseLingInfos(&lingInfosHandler); + entry.parseLingInfos(&lingInfosHandler); } } - else - { - delete entry; - } +// else +// { +// LDEBUG << "EnchantSpellingAlternativesPrivate::setEnchantSpellingAlternatives correction" << correction 
<< "not found in the dictionary"; +// delete entry; +// } } } } From 160e7ff07f9324af33a548c35ccb880886d95e38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABl=20de=20Chalendar?= Date: Wed, 2 Dec 2015 09:59:46 +0100 Subject: [PATCH 10/82] Correct a microcategory in a Modex --- lima_linguisticdata/SpecificEntities/conf/Numex-modex.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lima_linguisticdata/SpecificEntities/conf/Numex-modex.xml b/lima_linguisticdata/SpecificEntities/conf/Numex-modex.xml index 810c6a05b..289adff19 100644 --- a/lima_linguisticdata/SpecificEntities/conf/Numex-modex.xml +++ b/lima_linguisticdata/SpecificEntities/conf/Numex-modex.xml @@ -105,7 +105,7 @@ - + From 30cb5b612e1f752abb9b9af4f84ade7e4f22d947 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABl=20de=20Chalendar?= Date: Wed, 2 Dec 2015 10:01:31 +0100 Subject: [PATCH 11/82] Add missing words in French dictionary --- .../analysisDictionary/fre/lefff/lefff-ext-1.txt | 2 ++ .../analysisDictionary/fre/lefff/lefff-ext-2.txt | 1 + .../analysisDictionary/fre/lefff/lefff-ext-3.txt | 4 ++++ .../analysisDictionary/fre/lefff/lefff-ext-lima.dic | 7 +++++++ 4 files changed, 14 insertions(+) diff --git a/lima_linguisticdata/analysisDictionary/fre/lefff/lefff-ext-1.txt b/lima_linguisticdata/analysisDictionary/fre/lefff/lefff-ext-1.txt index b67191979..5ecac0c46 100644 --- a/lima_linguisticdata/analysisDictionary/fre/lefff/lefff-ext-1.txt +++ b/lima_linguisticdata/analysisDictionary/fre/lefff/lefff-ext-1.txt @@ -225903,6 +225903,8 @@ chiliens 100 nc [pred="chilien_____1",cat chilio-_ 100 adjPref [pred="chilio-______1",cat=adv] chilio-______1 Default %default pref chimie 100 nc [pred="chimie_____1",cat=nc,@fs] chimie_____1 Default fs %default nc-2f chimies 100 nc [pred="chimie_____1",cat=nc,@fp] chimie_____1 Default fp %default nc-2f +chimio 100 nc [pred="chimiothérapie_____1",cat=nc,@fs] chimiothérapie_____1 Default fs %default nc-2f +chimios 100 nc 
[pred="chimiothérapie_____1",cat=nc,@fp] chimiothérapie_____1 Default fp %default nc-2f chimiothérapie 100 nc [pred="chimiothérapie_____1",cat=nc,@fs] chimiothérapie_____1 Default fs %default nc-2f chimiothérapies 100 nc [pred="chimiothérapie_____1",cat=nc,@fp] chimiothérapie_____1 Default fp %default nc-2f chimique 100 adj [pred="chimique_____1",@pers,cat=adj,@s] chimique_____1 Default s %adj_personnel adj-ique2 diff --git a/lima_linguisticdata/analysisDictionary/fre/lefff/lefff-ext-2.txt b/lima_linguisticdata/analysisDictionary/fre/lefff/lefff-ext-2.txt index ba2ea6eef..7b6751651 100644 --- a/lima_linguisticdata/analysisDictionary/fre/lefff/lefff-ext-2.txt +++ b/lima_linguisticdata/analysisDictionary/fre/lefff/lefff-ext-2.txt @@ -35086,6 +35086,7 @@ flashé 100 v [pred="flasher_____1",@active flashée 100 v [pred="flasher_____1",@active,@pers,cat=v,@Kfs] flasher_____1 PastParticiple Kfs %actif v-er:std flashées 100 v [pred="flasher_____1",@active,@pers,cat=v,@Kfp] flasher_____1 PastParticiple Kfp %actif v-er:std flashés 100 v [pred="flasher_____1",@active,@pers,cat=v,@Kmp] flasher_____1 PastParticiple Kmp %actif v-er:std +flashy 100 adj [pred="flashy_____1",@pers,cat=adj] flashy_____1 Default %adj_personnel adj-1 flasque 100 adj [pred="flasque_____1",@pers,cat=adj,@s] flasque_____1 Default s %adj_personnel adj-2 flasque 100 nc [pred="flasque_____1",cat=nc,@s] flasque_____1 Default s %default nc-2 flasquement 100 advm [pred="flasquement_____1",clivee=+,cat=adv] flasquement_____1 Default %default adv diff --git a/lima_linguisticdata/analysisDictionary/fre/lefff/lefff-ext-3.txt b/lima_linguisticdata/analysisDictionary/fre/lefff/lefff-ext-3.txt index 33ce70bce..18e336af9 100644 --- a/lima_linguisticdata/analysisDictionary/fre/lefff/lefff-ext-3.txt +++ b/lima_linguisticdata/analysisDictionary/fre/lefff/lefff-ext-3.txt @@ -24681,6 +24681,8 @@ smaltines 100 nc [pred="smaltine_____1",c smalts 100 nc [pred="smalt_____1",cat=nc,@mp] smalt_____1 Default mp %default nc-2m 
smaragdite 100 nc [pred="smaragdite_____1",cat=nc,@fs] smaragdite_____1 Default fs %default nc-2f smaragdites 100 nc [pred="smaragdite_____1",cat=nc,@fp] smaragdite_____1 Default fp %default nc-2f +smartphone 100 nc [pred="smartphone_____1",cat=nc,@ms] smartphone_____1 Default ms %default nc-2m +smartphones 100 nc [pred="smartphone_____1",cat=nc,@mp] smartphone_____1 Default mp %default nc-2m smash 100 nc [pred="smash_____1",cat=nc,semtype=event|-,@ms] smash_____1 Default ms %default 0 smasha 100 v [pred="smasher_____1",@pers,cat=v,@J3s] smasher_____1 ThirdSing J3s %actif v-er:std smashai 100 v [pred="smasher_____1",@pers,cat=v,@J1s] smasher_____1 Default J1s %actif v-er:std @@ -42805,6 +42807,8 @@ stimula 100 v [pred="stimuler_____1",@pers, stimula 100 v [pred="stimuler_____1",@pers,cat=v,@J3s] stimuler_____1 ThirdSing J3s %actif v-er:std stimula 100 v [pred="stimuler_____1se",@pers,@se_moyen,@être,cat=v,@J3s] stimuler_____1 ThirdSing J3s %se_moyen v-er:std stimula 100 v [pred="stimuler_____2",@pers,cat=v,@J3s] stimuler_____2 ThirdSing J3s %actif v-er:std +stimulable 100 adj [pred="stimulable_____1",@pers,cat=adj,@s] stimulable_____1 Default s %adj_personnel adj-2 +stimulables 100 adj [pred="stimulable_____1",@pers,cat=adj,@p] stimulable_____1 Default p %adj_personnel adj-2 stimulai 100 v [pred="stimuler_____1",@pers,cat=v,@J1s] stimuler_____1 Default J1s %actif v-er:std stimulai 100 v [pred="stimuler_____1",@pers,cat=v,@J1s] stimuler_____1 Default J1s %actif v-er:std stimulai 100 v [pred="stimuler_____1se",@pers,@se_moyen,@être,cat=v,@J1s] stimuler_____1 Default J1s %se_moyen v-er:std diff --git a/lima_linguisticdata/analysisDictionary/fre/lefff/lefff-ext-lima.dic b/lima_linguisticdata/analysisDictionary/fre/lefff/lefff-ext-lima.dic index c7899c8d4..853bc4762 100644 --- a/lima_linguisticdata/analysisDictionary/fre/lefff/lefff-ext-lima.dic +++ b/lima_linguisticdata/analysisDictionary/fre/lefff/lefff-ext-lima.dic @@ -140181,6 +140181,8 @@ chiliens chilien Ncgmp- 
chilio-_ chilio-_1 Ep chimie chimie Ncgfs- chimies chimie Ncgfp- +chimio chimiothérapie Ncgfs- +chimios chimiothérapie Ncgfp- chimiothérapie chimiothérapie Ncgfs- chimiothérapies chimiothérapie Ncgfp- chimique chimique Afha--- @@ -548902,6 +548904,8 @@ stimugène stimugène Ncgms- stimugènes stimugène Ncgmp- stimula stimuler Vpisi3-s stimula stimuler Vpisp3-s +stimulable stimulable Afha--- +stimulables stimulable Afha--- stimulai stimuler Vpisi1-s stimulai stimuler Vpisp1-s stimulaient stimuler Vpiii3-p @@ -629835,3 +629839,6 @@ trois trois ADJNUM vingt vingt ADJNUM un un ADJNUM FOREIGN FOREIGN Ee +smartphone smartphone Ncgms- +smartphones smartphone Ncgmp- +flashy flashy Afha--- From 55fb4b57ce2f90887d59d72621b1b472cf7d9335 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABl=20de=20Chalendar?= Date: Wed, 2 Dec 2015 10:02:28 +0100 Subject: [PATCH 12/82] =?UTF-8?q?Handle=20an=20idiom=20with=20=E2=80=99=20?= =?UTF-8?q?instead=20of=20'?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lima_linguisticdata/rules-idiom/fre/src/idioms-fre.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lima_linguisticdata/rules-idiom/fre/src/idioms-fre.txt b/lima_linguisticdata/rules-idiom/fre/src/idioms-fre.txt index 7b0f2dcca..5014b5000 100644 --- a/lima_linguisticdata/rules-idiom/fre/src/idioms-fre.txt +++ b/lima_linguisticdata/rules-idiom/fre/src/idioms-fre.txt @@ -3024,6 +3024,9 @@ ID;i;A;Quant;Quant à;préposition;quant à ID;i;A;Quant;Quant au;préposition article;quant à ID;i;A;Quant;Quant aux;préposition article pluriel;quant à ID;i;A;[D]aujourd';[D]aujourd' hui;adverbe;aujourd'hui +GC;i;A;[D]Aujourd';[D]Aujourd' hui;adverbe;aujourd'hui +GC;i;A;[D]aujourd’;[D]aujourd’ hui;adverbe;aujourd'hui +GC;i;A;[D]Aujourd’;[D]Aujourd’ hui;adverbe;aujourd'hui ID;i;A;extenso;in extenso;adverbe; ID;i;A;extremis;in extremis;adverbe; ID;i;A;facto;de facto;adverbe; From fc064bc48652b3d2be599dcca09dc785f893c623 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Ga=C3=ABl=20de=20Chalendar?= Date: Wed, 2 Dec 2015 10:03:54 +0100 Subject: [PATCH 13/82] Change process unit order idiomatic alternatives before spellchecking to correct handling of "aujourd'hui". The problem is that spelling errors inside idioms will prevent them from being detected. --- lima_linguisticprocessing/conf/lima-lp-fre.xml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/lima_linguisticprocessing/conf/lima-lp-fre.xml b/lima_linguisticprocessing/conf/lima-lp-fre.xml index 3716fb6c4..1b56c6d8f 100644 --- a/lima_linguisticprocessing/conf/lima-lp-fre.xml +++ b/lima_linguisticprocessing/conf/lima-lp-fre.xml @@ -18,13 +18,13 @@ + - @@ -163,8 +163,10 @@ - - + + + + @@ -333,6 +335,8 @@ + + From c77d356c381c2c3ba8fc5d6393f7ac4358e8e607 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABl=20de=20Chalendar?= Date: Wed, 2 Dec 2015 10:08:05 +0100 Subject: [PATCH 14/82] Spellchecking improvements New option to use only the best correction instead of all the possible ones. Spellcheck only tokens reachable from start to avoid correcting tokens already excluded, e.g. by idioms. Do not spellcheck tokens with t_url default key. 
Add position of token in info output --- .../HunspellSpellingAlternatives.cpp | 85 +++++++++++++++---- 1 file changed, 70 insertions(+), 15 deletions(-) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/HunspellSpellingAlternatives.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/HunspellSpellingAlternatives.cpp index ceceb312e..921b93ff1 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/HunspellSpellingAlternatives.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/HunspellSpellingAlternatives.cpp @@ -31,6 +31,7 @@ #include "linguisticProcessing/core/AnalysisDict/AbstractAnalysisDictionary.h" #include +#include using namespace Lima::LinguisticProcessing::LinguisticAnalysisStructure; using namespace Lima::LinguisticProcessing::AnalysisDict; @@ -52,7 +53,7 @@ class HunspellSpellingAlternativesPrivate friend class HunspellSpellingAlternatives; public: - HunspellSpellingAlternativesPrivate() : m_hunspell(0) {} + HunspellSpellingAlternativesPrivate() : m_hunspell(0), m_bestOnly(false) {} virtual ~HunspellSpellingAlternativesPrivate() {delete m_hunspell;} @@ -64,6 +65,7 @@ class HunspellSpellingAlternativesPrivate AnalysisDict::AbstractAnalysisDictionary* m_dictionary; MediaId m_language; Hunspell* m_hunspell; + bool m_bestOnly; }; @@ -110,6 +112,15 @@ void HunspellSpellingAlternatives::init( LERROR << "no param 'dictionary' in HunspellSpellingAlternatives group for language " << (int) m_d->m_language; throw InvalidConfiguration(); } + try + { + // try to get a specific spellchecking dictionary name from the config file + m_d->m_bestOnly = unitConfiguration.getBooleanParameter("bestOnly"); + } + catch (NoSuchParam& ) + { + LNOTICE << "no param 'bestOnly' in HunspellSpellingAlternatives group for language " << (int) m_d->m_language << ". 
Use default"; + } } @@ -124,24 +135,63 @@ LimaStatusCode HunspellSpellingAlternatives::process(AnalysisContent& analysis) LinguisticGraph* g=tokenList->getGraph(); VertexDataPropertyMap dataMap=get(vertex_data,*g); VertexTokenPropertyMap tokenMap=get(vertex_token,*g); - LinguisticGraphVertexIt it,itEnd; - for (boost::tie(it,itEnd)=vertices(*g) ; it != itEnd ; it++) + + LinguisticGraphVertex firstVx = tokenList->firstVertex(); + LinguisticGraphVertex lastVx = tokenList->lastVertex(); + + std::set< std::string > alreadyStored; + std::set visited; + //std::set alreadyStoredVertices; compatibilite 32 64 bits + std::set alreadyStoredVertices; + + std::queue toVisit; + toVisit.push(firstVx); + + while (!toVisit.empty()) { - LDEBUG << "HunspellSpellingAlternatives::process processing vertex " << *it; - Token* currentToken=tokenMap[*it]; - MorphoSyntacticData* msd=dataMap[*it]; - - if (currentToken!=0) + LinguisticGraphVertex v=toVisit.front(); +#ifdef DEBUG_LP + LDEBUG << "BowDumper::addVerticesToBoWText visiting" << v; +#endif + + toVisit.pop(); + if (v == lastVx) { + continue; + } + + LinguisticGraphOutEdgeIt outItr,outItrEnd; + for (boost::tie(outItr,outItrEnd)=out_edges(v,*g); + outItr!=outItrEnd; + outItr++) { - if (msd->empty()) + LinguisticGraphVertex next=target(*outItr,*g); + if (visited.find(next)==visited.end()) { - m_d->setHunspellSpellingAlternatives( - currentToken, - msd, - sp); + visited.insert(next); + toVisit.push(next); + } + } + + if (v != firstVx && v != lastVx) + { + LDEBUG << "HunspellSpellingAlternatives::process processing vertex " << v; + Token* currentToken=tokenMap[v]; + MorphoSyntacticData* msd=dataMap[v]; + + if (currentToken!=0) + { + if (msd->empty()) + { + m_d->setHunspellSpellingAlternatives( + currentToken, + msd, + sp); + } } } } + + LINFO << "MorphologicalAnalysis: ending process HunspellSpellingAlternatives"; return SUCCESS_ID; } @@ -161,12 +211,17 @@ void HunspellSpellingAlternativesPrivate::setHunspellSpellingAlternatives( || 
token->status().isAlphaConcatAbbrev() || token->status().isAlphaHyphen() || token->status().isAlphaPossessive() - || tokenStr.toUpper() == tokenStr) + || tokenStr.toUpper() == tokenStr + || token->status().defaultKey() == "t_url") { return; } char **suggestions; int suggestResult = m_hunspell->suggest(&suggestions, tokenStr.toUtf8().constData()); + if (suggestResult > 1 && m_bestOnly) + { + suggestResult = 1; + } for (int i = 0; i < suggestResult; i++) { LimaString correction = LimaString::fromUtf8(suggestions[i]); @@ -180,7 +235,7 @@ void HunspellSpellingAlternativesPrivate::setHunspellSpellingAlternatives( // if (!entry.isEmpty()) { - LINFO << "HunspellSpellingAlternativesPrivate::setHunspellSpellingAlternatives correcting" << tokenStr << "into" << correction; + LINFO << "HunspellSpellingAlternativesPrivate::setHunspellSpellingAlternatives correcting" << tokenStr << "into" << correction << "at" << token->position(); // add orthographic alternative to Token; StringsPoolIndex idx=sp[correction]; token->addOrthographicAlternatives(idx); From a6170c228526863e39b383a1a40791835b8c61d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABl=20de=20Chalendar?= Date: Mon, 7 Dec 2015 14:50:48 +0100 Subject: [PATCH 15/82] Add conll mapping for fre --- .../CMakeLists.txt | 8 +++- .../mapping_conll_lima_fre.txt | 40 +++++++++++++++++++ 2 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 lima_linguisticdata/SRLIntegration/lima_conll_dependency_tag_mapping/mapping_conll_lima_fre.txt diff --git a/lima_linguisticdata/SRLIntegration/lima_conll_dependency_tag_mapping/CMakeLists.txt b/lima_linguisticdata/SRLIntegration/lima_conll_dependency_tag_mapping/CMakeLists.txt index e3ef41cb5..58f8e8643 100644 --- a/lima_linguisticdata/SRLIntegration/lima_conll_dependency_tag_mapping/CMakeLists.txt +++ b/lima_linguisticdata/SRLIntegration/lima_conll_dependency_tag_mapping/CMakeLists.txt @@ -1 +1,7 @@ -install(FILES mapping_conll_Lima.txt COMPONENT common DESTINATION 
share/apps/lima/resources/SRLIntegration/lima_conll_dependency_tag_mapping) +install( +FILES + mapping_conll_Lima.txt + mapping_conll_lima_fre.txt +COMPONENT + common +DESTINATION share/apps/lima/resources/SRLIntegration/lima_conll_dependency_tag_mapping) diff --git a/lima_linguisticdata/SRLIntegration/lima_conll_dependency_tag_mapping/mapping_conll_lima_fre.txt b/lima_linguisticdata/SRLIntegration/lima_conll_dependency_tag_mapping/mapping_conll_lima_fre.txt new file mode 100644 index 000000000..0ffd6ec5a --- /dev/null +++ b/lima_linguisticdata/SRLIntegration/lima_conll_dependency_tag_mapping/mapping_conll_lima_fre.txt @@ -0,0 +1,40 @@ +acl acl +ADVADJ advmod +ADVADV advmod +AdvVerbe advmod +ADJPRENSUB amod +ATB_S cop +aux aux +auxpass auxpass +COD_V dobj +CodPrev dobj +COORD1 cc +COORD2 cc +COMPDUNOM nmod +COMPL ccomp +CPLV_V advmod +CPL_V iobj +DetAdj det +DetIntSub det +det det +DETSUB det +Dummy dep +Neg neg +MOD_V iobj +PrepDetInt case +PrepInf mark +PronSujVerbe nsubj +SUBADJPOST amod +SujInv nsubj +SUJ_V nsubj +PREPSUB case +MOD_A amod +MOD_N amod +SUBSUBJUX compound +APPOS appos +COMPADJ amod +COMPADV advmod +Pleon nsubj +PrepPartPres case +PrepPronRel case +PronReflVerbe expl From e002f78fa061ffb2f0f11cb0d5d9037af36a8a16 Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Sat, 5 Dec 2015 10:20:09 +0100 Subject: [PATCH 16/82] Allow to use CONLL dumper without a mapping file --- .../core/AnalysisDumpers/ConllDumper.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/ConllDumper.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/ConllDumper.cpp index d249547e4..4b2a62035 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/ConllDumper.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/ConllDumper.cpp @@ -194,8 +194,8 @@ void 
ConllDumper::init(Common::XMLConfigurationFiles::GroupConfigurationStructur } catch (Common::XMLConfigurationFiles::NoSuchParam& ) { - LERROR << "no parameter 'mappingFile' in ConllDumper group" << " !"; - throw InvalidConfiguration(); + LINFO << "no parameter 'mappingFile' in ConllDumper group" << " !"; +// throw InvalidConfiguration(); } } @@ -386,7 +386,8 @@ LimaStatusCode ConllDumper::process(AnalysisContent& analysis) const } else { - LERROR << "ConllDumper::process" << relName << "not found in mapping"; + conllRelName= relName; +// LERROR << "ConllDumper::process" << relName << "not found in mapping"; } } QString targetConllIdString = targetConllId > 0 ? QString("%1").arg(targetConllId) : "-"; From 9bd10e2a58de712421f78fc69e4336216fc1feb4 Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Sat, 5 Dec 2015 10:21:33 +0100 Subject: [PATCH 17/82] Set the CONLL dumper to be the default one --- lima_linguisticprocessing/conf/lima-lp-eng.xml | 11 ++++++----- lima_linguisticprocessing/conf/lima-lp-fre.xml | 7 ++++++- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/lima_linguisticprocessing/conf/lima-lp-eng.xml b/lima_linguisticprocessing/conf/lima-lp-eng.xml index f3c59a769..1ff773587 100644 --- a/lima_linguisticprocessing/conf/lima-lp-eng.xml +++ b/lima_linguisticprocessing/conf/lima-lp-eng.xml @@ -54,7 +54,7 @@ - + @@ -711,10 +711,6 @@ - - - - @@ -784,6 +780,11 @@ + + + + + diff --git a/lima_linguisticprocessing/conf/lima-lp-fre.xml b/lima_linguisticprocessing/conf/lima-lp-fre.xml index 1b56c6d8f..543ac5d0f 100644 --- a/lima_linguisticprocessing/conf/lima-lp-fre.xml +++ b/lima_linguisticprocessing/conf/lima-lp-fre.xml @@ -63,7 +63,8 @@ - + + @@ -792,6 +793,10 @@ + + + + From 92ef118ced5f45eb591654ec4dcbdcbc76fff5b1 Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Sat, 5 Dec 2015 10:49:06 +0100 Subject: [PATCH 18/82] Correct CONLL dumper Now follows the CONLL-X syntax described here: http://ilk.uvt.nl/conll/#dataformat --- 
.../core/AnalysisDumpers/ConllDumper.cpp | 34 +++++++++++++++---- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/ConllDumper.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/ConllDumper.cpp index 4b2a62035..a987a3580 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/ConllDumper.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/ConllDumper.cpp @@ -359,8 +359,9 @@ LimaStatusCode ConllDumper::process(AnalysisContent& analysis) const LDEBUG << "ConllDumper::process PosGraph token" << v; if( morphoData!=0 && !morphoData->empty() && ft != 0) { - const QString graphTag=QString::fromUtf8(static_cast(Common::MediaticData::MediaticData::single().mediaData(m_d->m_language)).getPropertyCodeManager().getPropertyManager("MICRO").getPropertySymbolicValue(morphoData->firstValue(*m_d->m_propertyAccessor)).c_str()); - LDEBUG << "ConllDumper::process graphTag:" << graphTag; + const QString macro=QString::fromUtf8(static_cast(Common::MediaticData::MediaticData::single().mediaData(m_d->m_language)).getPropertyCodeManager().getPropertyManager("MACRO").getPropertySymbolicValue(morphoData->firstValue(*m_d->m_propertyAccessor)).c_str()); + const QString micro=QString::fromUtf8(static_cast(Common::MediaticData::MediaticData::single().mediaData(m_d->m_language)).getPropertyCodeManager().getPropertyManager("MICRO").getPropertySymbolicValue(morphoData->firstValue(*m_d->m_propertyAccessor)).c_str()); + LDEBUG << "ConllDumper::process graphTag:" << micro; std::string inflectedToken=ft->stringForm().toUtf8().constData(); std::string lemmatizedToken; @@ -390,11 +391,30 @@ LimaStatusCode ConllDumper::process(AnalysisContent& analysis) const // LERROR << "ConllDumper::process" << relName << "not found in mapping"; } } - QString targetConllIdString = targetConllId > 0 ? 
QString("%1").arg(targetConllId) : "-"; - dstream->out() << tokenId << "\t"<< inflectedToken << "\t" - << lemmatizedToken << "\t" << graphTag << "\t" - << graphTag << "\t" << "-" << "\t" << targetConllIdString << "\t" - << conllRelName.toUtf8().constData() << "\t-\t-"; + // CONLL-X format + // http://ilk.uvt.nl/conll/#dataformat + // 1 ID Token counter, starting at 1 for each new sentence. + // 2 FORM Word form or punctuation symbol. + // 3 LEMMA Lemma or stem (depending on particular data set) of word form, or an underscore if not available. + // 4 CPOSTAG Coarse-grained part-of-speech tag, where tagset depends on the language. + // 5 POSTAG Fine-grained part-of-speech tag, where the tagset depends on the language, or identical to the coarse-grained part-of-speech tag if not available. + // 6 FEATS Unordered set of syntactic and/or morphological features (depending on the particular language), separated by a vertical bar (|), or an underscore if not available. + // 7 HEAD Head of the current token, which is either a value of ID or zero ('0'). Note that depending on the original treebank annotation, there may be multiple tokens with an ID of zero. + // 8 DEPREL Dependency relation to the HEAD. The set of dependency relations depends on the particular language. Note that depending on the original treebank annotation, the dependency relation may be meaningfull or simply 'ROOT'. + // 9 PHEAD Projective head of current token, which is either a value of ID or zero ('0'), or an underscore if not available. Note that depending on the original treebank annotation, there may be multiple tokens an with ID of zero. The dependency structure resulting from the PHEAD column is guaranteed to be projective (but is not available for all languages), whereas the structures resulting from the HEAD column will be non-projective for some sentences of some languages (but is always available). + // 10 PDEPREL Dependency relation to the PHEAD, or an underscore if not available. 
The set of dependency relations depends on the particular language. Note that depending on the original treebank annotation, the dependency relation may be meaningfull or simply 'ROOT'. + + QString targetConllIdString = targetConllId > 0 ? QString("%1").arg(targetConllId) : "_"; + dstream->out() << tokenId + << "\t" << inflectedToken + << "\t" << lemmatizedToken + << "\t" << macro + << "\t" << micro + << "\t" << "_" + << "\t" << targetConllIdString + << "\t" << conllRelName.toUtf8().constData() + << "\t" << "_" + << "\t" << "_"; if (!predicates.isEmpty()) { dstream->out() << "\t"; From b227d31ae314a4a8ea5018859985c64d42c446ec Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Sat, 5 Dec 2015 13:41:44 +0100 Subject: [PATCH 19/82] Add a named entity type column to the CONLL output --- .../core/AnalysisDumpers/ConllDumper.cpp | 46 ++++++++++++++----- 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/ConllDumper.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/ConllDumper.cpp index a987a3580..3170c8b3a 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/ConllDumper.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/ConllDumper.cpp @@ -361,7 +361,7 @@ LimaStatusCode ConllDumper::process(AnalysisContent& analysis) const { const QString macro=QString::fromUtf8(static_cast(Common::MediaticData::MediaticData::single().mediaData(m_d->m_language)).getPropertyCodeManager().getPropertyManager("MACRO").getPropertySymbolicValue(morphoData->firstValue(*m_d->m_propertyAccessor)).c_str()); const QString micro=QString::fromUtf8(static_cast(Common::MediaticData::MediaticData::single().mediaData(m_d->m_language)).getPropertyCodeManager().getPropertyManager("MICRO").getPropertySymbolicValue(morphoData->firstValue(*m_d->m_propertyAccessor)).c_str()); - LDEBUG << "ConllDumper::process graphTag:" << micro; + 
LDEBUG << "ConllDumper::process graphTag:" << micro; std::string inflectedToken=ft->stringForm().toUtf8().constData(); std::string lemmatizedToken; @@ -370,7 +370,29 @@ LimaStatusCode ConllDumper::process(AnalysisContent& analysis) const lemmatizedToken=sp[(*morphoData)[0].lemma].toUtf8().constData(); } - QString conllRelName = "-"; + QString neType = QString::fromUtf8("_") ; + std::set< AnnotationGraphVertex > anaVertices = annotationData->matches("PosGraph",v,"AnalysisGraph"); + // note: anaVertices size should be 0 or 1 + for (std::set< AnnotationGraphVertex >::const_iterator anaVerticesIt = anaVertices.begin(); + anaVerticesIt != anaVertices.end(); anaVerticesIt++) + { + std::set< AnnotationGraphVertex > matches = annotationData->matches("AnalysisGraph",*anaVerticesIt,"annot"); + for (std::set< AnnotationGraphVertex >::const_iterator it = matches.begin(); + it != matches.end(); it++) + { + AnnotationGraphVertex vx=*it; + if (annotationData->hasAnnotation(vx, Common::Misc::utf8stdstring2limastring("SpecificEntity"))) + { + const SpecificEntityAnnotation* se = + annotationData->annotation(vx, Common::Misc::utf8stdstring2limastring("SpecificEntity")). + pointerValue(); + neType = Common::MediaticData::MediaticData::single().getEntityName(se->getType()); + break; + } + } + if (neType != "_") break; + } + QString conllRelName = "_"; int targetConllId = 0; if (vertexDependencyInformations.count(v)!=0) { @@ -391,27 +413,29 @@ LimaStatusCode ConllDumper::process(AnalysisContent& analysis) const // LERROR << "ConllDumper::process" << relName << "not found in mapping"; } } - // CONLL-X format + // Modified CONLL-X format with an extra named entity type column // http://ilk.uvt.nl/conll/#dataformat // 1 ID Token counter, starting at 1 for each new sentence. // 2 FORM Word form or punctuation symbol. // 3 LEMMA Lemma or stem (depending on particular data set) of word form, or an underscore if not available. 
// 4 CPOSTAG Coarse-grained part-of-speech tag, where tagset depends on the language. // 5 POSTAG Fine-grained part-of-speech tag, where the tagset depends on the language, or identical to the coarse-grained part-of-speech tag if not available. - // 6 FEATS Unordered set of syntactic and/or morphological features (depending on the particular language), separated by a vertical bar (|), or an underscore if not available. - // 7 HEAD Head of the current token, which is either a value of ID or zero ('0'). Note that depending on the original treebank annotation, there may be multiple tokens with an ID of zero. - // 8 DEPREL Dependency relation to the HEAD. The set of dependency relations depends on the particular language. Note that depending on the original treebank annotation, the dependency relation may be meaningfull or simply 'ROOT'. - // 9 PHEAD Projective head of current token, which is either a value of ID or zero ('0'), or an underscore if not available. Note that depending on the original treebank annotation, there may be multiple tokens an with ID of zero. The dependency structure resulting from the PHEAD column is guaranteed to be projective (but is not available for all languages), whereas the structures resulting from the HEAD column will be non-projective for some sentences of some languages (but is always available). - // 10 PDEPREL Dependency relation to the PHEAD, or an underscore if not available. The set of dependency relations depends on the particular language. Note that depending on the original treebank annotation, the dependency relation may be meaningfull or simply 'ROOT'. + // 6 NER Extra column: Named entity type + // 7 FEATS Unordered set of syntactic and/or morphological features (depending on the particular language), separated by a vertical bar (|), or an underscore if not available. + // 8 HEAD Head of the current token, which is either a value of ID or zero ('0'). 
Note that depending on the original treebank annotation, there may be multiple tokens with an ID of zero. + // 9 DEPREL Dependency relation to the HEAD. The set of dependency relations depends on the particular language. Note that depending on the original treebank annotation, the dependency relation may be meaningfull or simply 'ROOT'. + // 10 PHEAD Projective head of current token, which is either a value of ID or zero ('0'), or an underscore if not available. Note that depending on the original treebank annotation, there may be multiple tokens an with ID of zero. The dependency structure resulting from the PHEAD column is guaranteed to be projective (but is not available for all languages), whereas the structures resulting from the HEAD column will be non-projective for some sentences of some languages (but is always available). + // 11 PDEPREL Dependency relation to the PHEAD, or an underscore if not available. The set of dependency relations depends on the particular language. Note that depending on the original treebank annotation, the dependency relation may be meaningfull or simply 'ROOT'. QString targetConllIdString = targetConllId > 0 ? 
QString("%1").arg(targetConllId) : "_"; dstream->out() << tokenId << "\t" << inflectedToken << "\t" << lemmatizedToken - << "\t" << macro - << "\t" << micro + << "\t" << macro.toUtf8().constData() + << "\t" << micro.toUtf8().constData() + << "\t" << neType.toUtf8().constData() << "\t" << "_" - << "\t" << targetConllIdString + << "\t" << targetConllIdString.toUtf8().constData() << "\t" << conllRelName.toUtf8().constData() << "\t" << "_" << "\t" << "_"; From a4ae914ad5d7f3aa20eb74b3ceaa30a35cd7b031 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABl=20de=20Chalendar?= Date: Wed, 9 Dec 2015 15:14:20 +0100 Subject: [PATCH 20/82] Add explicit constructor for a structure to avoid a crash --- .../MorphoSyntacticData.cpp | 27 +++++++++++++++++++ .../MorphoSyntacticData.h | 4 +++ 2 files changed, 31 insertions(+) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticAnalysisStructure/MorphoSyntacticData.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticAnalysisStructure/MorphoSyntacticData.cpp index 8d4243a71..b438d2bde 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticAnalysisStructure/MorphoSyntacticData.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticAnalysisStructure/MorphoSyntacticData.cpp @@ -40,6 +40,33 @@ namespace LinguisticProcessing namespace LinguisticAnalysisStructure { +LinguisticElement::LinguisticElement() : + inflectedForm(0), + lemma(0), + normalizedForm(0), + properties(0), + type(NO_MORPHOSYNTACTICTYPE) + +{ +} +LinguisticElement::LinguisticElement(const LinguisticElement& le) : + inflectedForm(le.inflectedForm), + lemma(le.lemma), + normalizedForm(le.normalizedForm), + properties(le.properties), + type(le.type) +{ +} +LinguisticElement& LinguisticElement::operator=(const LinguisticElement& le) +{ + inflectedForm = le.inflectedForm; + lemma = le.lemma; + normalizedForm = le.normalizedForm; + properties = le.properties; + type = le.type; + return *this; 
+} + bool LinguisticElement::operator==(const LinguisticElement& le) const { return ((properties==le.properties) && diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticAnalysisStructure/MorphoSyntacticData.h b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticAnalysisStructure/MorphoSyntacticData.h index 745b97737..2c9aba2d0 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticAnalysisStructure/MorphoSyntacticData.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticAnalysisStructure/MorphoSyntacticData.h @@ -57,6 +57,10 @@ enum MorphoSyntacticType { }; struct LIMA_LINGUISTICANALYSISSTRUCTURE_EXPORT LinguisticElement { + LinguisticElement(); + LinguisticElement(const LinguisticElement& le); + LinguisticElement& operator=(const LinguisticElement& le); + StringsPoolIndex inflectedForm; StringsPoolIndex lemma; StringsPoolIndex normalizedForm; From 5f97af54ab94deee2d9c927cf313faf676b886fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABl=20de=20Chalendar?= Date: Wed, 9 Dec 2015 15:18:38 +0100 Subject: [PATCH 21/82] Give default categories to URLs --- .../analysisDictionary/fre/convert/default-fre.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lima_linguisticdata/analysisDictionary/fre/convert/default-fre.txt b/lima_linguisticdata/analysisDictionary/fre/convert/default-fre.txt index bcb0316c6..fc5740bf6 100644 --- a/lima_linguisticdata/analysisDictionary/fre/convert/default-fre.txt +++ b/lima_linguisticdata/analysisDictionary/fre/convert/default-fre.txt @@ -253,3 +253,8 @@ t_ordinal_integer Aoz--mp t_ordinal_integer Aoz--fp t_pattern Ea t_fallback Ea +t_url Npgms- +t_url Npgfs- +t_url Npgmp- +t_url Npgfp- +t_url Ee \ No newline at end of file From 9ab306998c134d110f213c7a159dc460c073eee6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABl=20de=20Chalendar?= Date: Wed, 9 Dec 2015 15:34:22 +0100 Subject: [PATCH 22/82] Syntactic sugar --- 
.../MorphologicAnalysis/DefaultProperties.cpp | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/DefaultProperties.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/DefaultProperties.cpp index caa3795c7..5619d3e28 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/DefaultProperties.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/DefaultProperties.cpp @@ -118,7 +118,6 @@ LimaStatusCode DefaultProperties::process( AnalysisContent& analysis) const { Lima::TimeUtilsController timer("DefaultProperties"); - MORPHOLOGINIT; AnalysisGraph* tokenList=static_cast(analysis.getData("AnalysisGraph")); LinguisticGraph* g=tokenList->getGraph(); @@ -143,33 +142,37 @@ LimaStatusCode DefaultProperties::process( // orthographic alternatives, default properties are not applied> if (currentData->empty()) { - std::map >::const_iterator it=m_defaults.find(currentToken->status().defaultKey()); - if (it!=m_defaults.end()) { + auto it = m_defaults.find(currentToken->status().defaultKey()); + if (it!=m_defaults.end()) + { LinguisticElement elem; - elem.inflectedForm=currentToken->form(); + elem.inflectedForm = currentToken->form(); if (!currentToken->orthographicAlternatives().empty()) { elem.lemma = *(currentToken->orthographicAlternatives().begin()); } else if(m_skipUnmarkStatus.find(currentToken->status().defaultKey())==m_skipUnmarkStatus.end()) { - LimaString str = m_charChart->toLower(currentToken->stringForm()); + LimaString str; +// elem.lemma= Common::MediaticData::MediaticData::changeable().stringsPool(m_language)[currentToken->stringForm()]; +// LimaString str = m_charChart->toLower(currentToken->stringForm()); elem.lemma= Common::MediaticData::MediaticData::changeable().stringsPool(m_language)[str]; } elem.normalizedForm=elem.lemma; elem.type=UNKNOWN_WORD; - for 
(std::vector::const_iterator codeItr=it->second.begin(); - codeItr!=it->second.end(); - codeItr++) + for (auto codeItr=it->second.begin(); codeItr!=it->second.end();codeItr++) { elem.properties=*codeItr; currentData->push_back(elem); } - } else { + } + else + { + MORPHOLOGINIT; LWARN << "No default property for " - << Common::Misc::limastring2utf8stdstring(currentToken->stringForm()) << ". Status : " - << Common::Misc::limastring2utf8stdstring(currentToken->status().defaultKey()); + << currentToken->stringForm() << ". Status : " + << currentToken->status().defaultKey(); } } } From d3e72d7d151c9efef9e6451df4f395ad515e7ecf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABl=20de=20Chalendar?= Date: Tue, 9 Feb 2016 13:55:02 +0100 Subject: [PATCH 23/82] Protect LDEBUG with ifdef --- .../core/AnalysisDumpers/SimpleXmlDumper.cpp | 35 +++++++++++++++---- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/SimpleXmlDumper.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/SimpleXmlDumper.cpp index 13de6bc7a..3615aa3fc 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/SimpleXmlDumper.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/SimpleXmlDumper.cpp @@ -167,8 +167,9 @@ xmlOutput(std::ostream& out, AnalysisGraph* posgraph, const Common::AnnotationGraphs::AnnotationData* annotationData) const { +#ifdef DEBUG_LP DUMPERLOGINIT; - +#endif out << "" << endl; LinguisticMetaData* metadata=static_cast(analysis.getData("LinguisticMetaData")); @@ -179,6 +180,7 @@ xmlOutput(std::ostream& out, if (sb==0) { + DUMPERLOGINIT; LWARN << "no SentenceBoundaries"; } @@ -200,7 +202,9 @@ xmlOutput(std::ostream& out, { // ??OME2 uint64_t nbSentences(sb->size()); uint64_t nbSentences((sb->getSegments()).size()); +#ifdef DEBUG_LP LDEBUG << "SimpleXmlDumper: "<< nbSentences << " sentences found"; +#endif for (uint64_t i=0; 
igetStartOffset()); string str=oss.str(); if (str.empty()) { +#ifdef DEBUG_LP LDEBUG << "nothing to dump in this sentence"; +#endif } else { out << "" << endl @@ -249,10 +256,10 @@ xmlOutputVertices(std::ostream& out, const uint64_t offset) const { +#ifdef DEBUG_LP DUMPERLOGINIT; - LDEBUG << "SimpleXmlDumper: ========================================"; - LDEBUG << "SimpleXmlDumper: outputXml from vertex " << begin << " to vertex " << end; - + LDEBUG << "SimpleXmlDumper::xmlOutputVertices from vertex " << begin << " to vertex " << end; +#endif LinguisticGraph* graph=posgraph->getGraph(); LinguisticGraphVertex lastVertex=posgraph->lastVertex(); @@ -333,6 +340,10 @@ xmlOutputVertex(std::ostream& out, const FsaStringsPool& sp, uint64_t offset) const { +#ifdef DEBUG_LP + DUMPERLOGINIT; + LDEBUG << "SimpleXmlDumper::xmlOutputVertex" << v; +#endif MorphoSyntacticData* data=get(vertex_data,*(posgraph->getGraph()),v); // first, check if vertex corresponds to a specific entity found before pos tagging (i.e. in analysis graph) @@ -341,16 +352,25 @@ xmlOutputVertex(std::ostream& out, for (std::set< AnnotationGraphVertex >::const_iterator anaVerticesIt = anaVertices.begin(); anaVerticesIt != anaVertices.end(); anaVerticesIt++) { +#ifdef DEBUG_LP + LDEBUG << "SimpleXmlDumper::xmlOutputVertex AnalysisGraph vertex for" << v << "is" << *anaVerticesIt; +#endif std::set< AnnotationGraphVertex > matches = annotationData->matches("AnalysisGraph",*anaVerticesIt,"annot"); for (std::set< AnnotationGraphVertex >::const_iterator it = matches.begin(); it != matches.end(); it++) { AnnotationGraphVertex vx=*it; +#ifdef DEBUG_LP + LDEBUG << "SimpleXmlDumper::xmlOutputVertex vertex" << v << "," << *anaVerticesIt << "has annot vertex" << vx; +#endif if (annotationData->hasAnnotation(vx, Common::Misc::utf8stdstring2limastring("SpecificEntity"))) { const SpecificEntityAnnotation* se = annotationData->annotation(vx, Common::Misc::utf8stdstring2limastring("SpecificEntity")). 
pointerValue(); +#ifdef DEBUG_LP + LDEBUG << "SimpleXmlDumper::xmlOutputVertex annot vertex" << vx << "has SpecificEntity annotation"; +#endif if (outputSpecificEntity(out,se,data,anagraph->getGraph(),sp,offset)) { return; } @@ -485,9 +505,10 @@ outputSpecificEntity(std::ostream& out, // take as category for parts the category for the named entity LinguisticCode category=m_propertyAccessor->readValue(data->begin()->properties); +#ifdef DEBUG_LP DUMPERLOGINIT; LDEBUG << "Using category " << m_propertyManager->getPropertySymbolicValue(category) << " for specific entity of type " << typeName; - +#endif // get the parts of the named entity match // use the category of the named entity for all elements for (std::vector< LinguisticGraphVertex>::const_iterator m(se->m_vertices.begin()); @@ -512,7 +533,7 @@ std::string SimpleXmlDumper::xmlString(const std::string& inputStr) const replace(str,"<", "<"); replace(str,">", ">"); replace(str,"\"", """); - replace(str,"\n", "\n"); + replace(str,"\n", ""); return str; } From d1ccb821dd9ee5b528d9b8c4542e74c044a39782 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABl=20de=20Chalendar?= Date: Tue, 9 Feb 2016 13:55:35 +0100 Subject: [PATCH 24/82] Allow splitting on t_sentence_break tokenization status --- .../AnalysisGraph.cpp | 21 ++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticAnalysisStructure/AnalysisGraph.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticAnalysisStructure/AnalysisGraph.cpp index 9216fbcc3..028ca1b99 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticAnalysisStructure/AnalysisGraph.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticAnalysisStructure/AnalysisGraph.cpp @@ -205,11 +205,15 @@ LinguisticGraphVertex AnalysisGraph::nextMainPathVertex( const std::list microFilters, LinguisticGraphVertex end) { +#ifdef DEBUG_LP + LASLOGINIT; +#endif 
/* * Algorithm: we're using a Breadth First Search and keep track of the * "thickness" of the lattice, and only stop if both condition apply: * 1/ the thickness is 1, meaning that every path goes through this node * 2/ the node is in microFilters (eg. a full stop in english) + * OR the node has t_sentence_break tokenization status */ std::set visited; LinguisticGraphOutEdgeIt outItr,outItrEnd; @@ -225,6 +229,7 @@ LinguisticGraphVertex AnalysisGraph::nextMainPathVertex( toVisit.push(target(*outItr,*m_graph)); } + VertexTokenPropertyMap tokenMap = get( vertex_token, *m_graph ); // search while (!toVisit.empty()) { @@ -235,13 +240,27 @@ LinguisticGraphVertex AnalysisGraph::nextMainPathVertex( { return end; } + Token* ft = tokenMap[current]; accumulator-=in_degree(current,*m_graph); if (accumulator==0) { // check unique category only if accumulator is 0 MorphoSyntacticData* msd=get(vertex_data,*m_graph,current); - if (msd!=0 && msd->hasUniqueMicro(microAccessor,microFilters)) return current; + if (msd!=0 && msd->hasUniqueMicro(microAccessor,microFilters)) + { +#ifdef DEBUG_LP + LDEBUG << "AnalysisGraph::nextMainPathVertex micro, return" << current; +#endif + return current; + } + if (ft && ft->status().getStatus() == T_SENTENCE_BRK) + { +#ifdef DEBUG_LP + LDEBUG << "AnalysisGraph::nextMainPathVertex sentence break, return" << current; +#endif + return current; + } } accumulator+=out_degree(current,*m_graph); From 64685c75e9e776334c802a5d3bcf660ccdca844c Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Wed, 17 Feb 2016 15:53:18 +0100 Subject: [PATCH 25/82] Not propuce multiple same IndexElement in iterator --- .../BagOfWords/indexElementIterator.cpp | 469 +++++++++++------- 1 file changed, 290 insertions(+), 179 deletions(-) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/indexElementIterator.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/indexElementIterator.cpp index 91d9cbc0b..ad5d0d45e 100644 --- 
a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/indexElementIterator.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/indexElementIterator.cpp @@ -26,6 +26,7 @@ #include "indexElement.h" #include "linguisticProcessing/common/BagOfWords/BoWRelation.h" +#include "linguisticProcessing/common/BagOfWords/bowTerm.h" #include "linguisticProcessing/common/BagOfWords/BoWPredicate.h" #include "indexElementIterator.h" #include "common/FsaAccess/AbstractLexiconIdGenerator.h" @@ -55,37 +56,39 @@ class IndexElementIteratorPrivate IndexElementIteratorPrivate(const IndexElementIteratorPrivate& ieip); ~IndexElementIteratorPrivate(); - typedef std::deque IndexElementQueue; - - // members - BoWText::const_iterator m_iterator; - BoWText::const_iterator m_iteratorEnd; - IndexElementQueue m_partQueue; - uint64_t m_maxSizeQueue; - uint64_t m_maxCompoundSize; - AbstractLexiconIdGenerator* m_idGenerator; - // private functions - - // add in queue - // (return false if size of queue becomes greater than max) - bool addInPartQueue(const uint64_t id, - const BoWType type, - const LimaString& word, - const uint64_t cat, - const uint64_t position, - const uint64_t length, - const Common::MediaticData::EntityType neType); + /** */ void getPositionLengthList(const std::vector& structure, Misc::PositionLengthList& poslenlist) const; - // add in queue: only used for compound elements + + /** Add @ref newElement in queue, only if queue size is lower than its maximum. 
+ * Only used for compound elements + * @return true if the element has been added and false otherwise (size of queue would become + * greater than max) + */ bool addInPartQueue(const IndexElement& newElement); - void storePartsInQueue(boost::shared_ptr< BoWToken > token, const uint64_t rel); + /** Calls addPartElementsInQueue to recursively add @ref token parts and itself in the queue + */ + void storePartsInQueue(boost::shared_ptr< BoWToken > token); bool addPartElementsInQueue(boost::shared_ptr< BoWToken > token, std::pair, uint64_t> & ids_rels, const uint64_t rel); + + /** + * this function is recursive to build all composed elements that contains + * the head and all or parts of the extensions, for all possible values (ids) + * of head and extensions + * + * @param partIdsRels : the possible ids of each part, plus one relation per part + * @param head : the position of the head in the parts + * @param ids : the id list in which new ids are added for combined element + * @param structure : the current structure + * @param i : the current part looked at + * + * @return + */ bool addCombinedPartsInQueue(const Lima::Common::BagOfWords::BoWType type, const std::vector, uint64_t> >& partIds_Rels, const uint64_t head, @@ -95,6 +98,16 @@ class IndexElementIteratorPrivate std::vector& relations, const uint64_t i); + typedef std::deque IndexElementQueue; + + // members + BoWText::const_iterator m_iterator; + BoWText::const_iterator m_iteratorEnd; + IndexElementQueue m_partQueue; + uint64_t m_maxSizeQueue; + uint64_t m_maxCompoundSize; + AbstractLexiconIdGenerator* m_idGenerator; + QMap m_alreadyFoundElements; }; IndexElementIteratorPrivate::IndexElementIteratorPrivate(const BoWText& bowText, @@ -168,64 +181,109 @@ bool IndexElementIterator::isAtEnd() const // get current element ("dereference" iterator) //********************************************************************** // getting parts is done in this function (rather than in ++ function): -// which means that is 
a ++ is done before calling a getElement on +// which means that if a ++ is done before calling a getElement on // a complex token, no parts will be explored IndexElement IndexElementIterator::getElement() { +#ifdef DEBUG_CD BOWLOGINIT; - + LDEBUG << "IndexElementIterator::getElement empty:" << m_d->m_partQueue.empty(); +#endif + // If queue is empty + // - for simple tokens: a new index element is returned + // - for complex tokens : it is filled and then its front is returned if (m_d->m_partQueue.empty()) { if (m_d->m_iterator==m_d->m_iteratorEnd) { // at end +#ifdef DEBUG_CD + LDEBUG << "IndexElementIterator::getElement at end: return empty element"; +#endif return IndexElement(); // empty element has id 0 } else { - boost::shared_ptr< BoWToken> token; + boost::shared_ptr< BoWToken> token = boost::dynamic_pointer_cast((*m_d->m_iterator)); boost::shared_ptr< BoWPredicate > predicate; + switch ((*m_d->m_iterator)->getType()) { - case BOW_TOKEN: - { - token = boost::dynamic_pointer_cast((*m_d->m_iterator)); - uint64_t id=m_d->m_idGenerator->getId(token->getString()); - return IndexElement(id, - token->getType(), - token->getLemma(), - token->getCategory(), - token->getPosition(), - token->getLength() - ); - } - case BOW_TERM: - case BOW_NAMEDENTITY: - LDEBUG << "IndexElementIterator::getElement BOW_NAMEDENTITY" /*<< * (static_cast((*m_d->m_iterator)) ) << Lima::Common::MediaticData::MediaticData::single().getEntityName(static_cast((*m_d->m_iterator))->getNamedEntityType())*/; - // element itself will be stored in queue as part - m_d->storePartsInQueue(boost::dynamic_pointer_cast(*m_d->m_iterator),0); - return m_d->m_partQueue.front(); - // FIXME Change the handling of predicates to take into account their complex structure nature - case BOW_PREDICATE: - { - predicate = boost::dynamic_pointer_cast((*m_d->m_iterator)); - uint64_t id=m_d->m_idGenerator->getId(predicate->getString()); - return IndexElement(id, - predicate->getType(), - predicate->getString(), - 0, - 
predicate->getPosition(), - predicate->getLength(), - predicate->getPredicateType() - ); - } - case BOW_NOTYPE: - ; + case BOW_TOKEN: + { +#ifdef DEBUG_CD + LDEBUG << "IndexElementIterator::getElement simple token:" << token->getIdUTF8String(); +#endif + if (!m_d->m_alreadyFoundElements.contains(QString::fromUtf8(token->getIdUTF8String().c_str()))) + { + m_d->m_alreadyFoundElements.insert(QString::fromUtf8(token->getIdUTF8String().c_str()), + IndexElement(m_d->m_idGenerator->getId(token->getString()), + token->getType(), + token->getLemma(), + token->getCategory(), + token->getPosition(), + token->getLength() + )); + } + return m_d->m_alreadyFoundElements[QString::fromUtf8(token->getIdUTF8String().c_str())]; + } + case BOW_TERM: +#ifdef DEBUG_CD + LDEBUG << "IndexElementIterator::getElement term:" << token->getIdUTF8String(); +#endif + m_d->storePartsInQueue(token); + if (m_d->m_partQueue.empty()) { +#ifdef DEBUG_CD + LDEBUG << "IndexElementIterator::getElement term: part queue is empty" ; +#endif + (*this)++; + return getElement(); + } +#ifdef DEBUG_CD + LDEBUG << "IndexElementIterator::getElement term after storePartsInQueue front is:" << m_d->m_partQueue.front(); +#endif + m_d->m_alreadyFoundElements.insert(QString::fromUtf8(token->getIdUTF8String().c_str()),m_d->m_partQueue.front()); + return m_d->m_partQueue.front(); + + case BOW_NAMEDENTITY: +#ifdef DEBUG_CD + LDEBUG << "IndexElementIterator::getElement named entity:" << boost::dynamic_pointer_cast(*m_d->m_iterator)->getIdUTF8String() ;//<< Lima::Common::MediaticData::MediaticData::single().getEntityName(static_cast((*m_d->m_iterator))->getNamedEntityType()); + // element itself will be stored in queue as part +#endif + m_d->storePartsInQueue(token); +#ifdef DEBUG_CD + LDEBUG << "IndexElementIterator::getElement ne after storePartsInQueue front is:" << m_d->m_partQueue.front(); +#endif + m_d->m_alreadyFoundElements.insert(QString::fromUtf8(token->getIdUTF8String().c_str()),m_d->m_partQueue.front()); + return 
m_d->m_partQueue.front(); + + // FIXME Change the handling of predicates to take into account their complex structure nature + case BOW_PREDICATE: + { + predicate = boost::dynamic_pointer_cast((*m_d->m_iterator)); + uint64_t id=m_d->m_idGenerator->getId(predicate->getString()); + return IndexElement(id, + predicate->getType(), + predicate->getString(), + 0, + predicate->getPosition(), + predicate->getLength(), + predicate->getPredicateType() + ); + } + case BOW_NOTYPE: + return IndexElement(); } } } + // Queue was not empty, returning its front else { +#ifdef DEBUG_CD + LDEBUG << "IndexElementIterator::getElement empty:" << m_d->m_partQueue.empty() << "return part queue front" << m_d->m_partQueue.front(); +#endif return m_d->m_partQueue.front(); } + + // Unreachable return IndexElement(); // empty element has id 0 } @@ -234,15 +292,41 @@ IndexElement IndexElementIterator::getElement() //********************************************************************** IndexElementIterator& IndexElementIterator::operator++() { +#ifdef DEBUG_CD + BOWLOGINIT; +#endif + // If queue is empty, try to advance the text iterator to the next BoWToken + // Otherwose, pop the front element and advance the text iterator if the queue is now empty if (m_d->m_partQueue.empty()) { - if (m_d->m_iterator!=m_d->m_iteratorEnd) { +#ifdef DEBUG_CD + LDEBUG << "IndexElementIterator::operator++ part queue is empty"; +#endif + if (m_d->m_iterator!=m_d->m_iteratorEnd) { m_d->m_iterator++; + // Jump already found elements +#ifdef DEBUG_CD + LDEBUG << "IndexElementIterator::operator++ Jump if necessary"; +#endif + while (m_d->m_iterator != m_d->m_iteratorEnd && + boost::dynamic_pointer_cast((*m_d->m_iterator)) && + m_d->m_alreadyFoundElements.contains( QString::fromUtf8(boost::dynamic_pointer_cast((*m_d->m_iterator))->getIdUTF8String().c_str()) ) ) { + m_d->m_iterator++; + } } } else { +#ifdef DEBUG_CD + LDEBUG << "IndexElementIterator::operator++ part queue not empty"; +#endif 
m_d->m_partQueue.pop_front(); if (m_d->m_partQueue.empty()) { // finished for the parts of this token - m_d->m_iterator++; + m_d->m_iterator++; + // Jump already found elements + while (m_d->m_iterator != m_d->m_iteratorEnd && + boost::dynamic_pointer_cast((*m_d->m_iterator)) && + m_d->m_alreadyFoundElements.contains( QString::fromUtf8(boost::dynamic_pointer_cast((*m_d->m_iterator))->getIdUTF8String().c_str()) ) ) { + m_d->m_iterator++; + } } } return *this; @@ -258,29 +342,6 @@ IndexElementIterator IndexElementIterator::operator++(int) { //********************************************************************** // helper functions for iterator //********************************************************************** -bool IndexElementIteratorPrivate::addInPartQueue(const uint64_t id, - const BoWType type, - const LimaString& word, - const uint64_t cat, - const uint64_t position, - const uint64_t length, - const Common::MediaticData::EntityType neType) -{ - if (m_partQueue.size() >= m_maxSizeQueue) { - BOWLOGINIT; - LWARN << "size of queue exceeded"; - return false; - } - - m_partQueue.push_back(IndexElement(id,type,word,cat,position,length,neType)); -// BOWLOGINIT; -// LDEBUG << "add in part queue " << id << ":" -// << word -// << ";size of queue=" << m_partQueue.size() -// ; - return true; -} - void IndexElementIteratorPrivate::getPositionLengthList(const std::vector& structure, PositionLengthList& poslenlist) const { @@ -307,6 +368,10 @@ void IndexElementIteratorPrivate::getPositionLengthList(const std::vector= m_maxSizeQueue) { BOWLOGINIT; LWARN << "size of queue exceeded"; @@ -330,13 +395,16 @@ bool IndexElementIteratorPrivate::addInPartQueue(const IndexElement& newElement) } -void IndexElementIteratorPrivate::storePartsInQueue(boost::shared_ptr< Lima::Common::BagOfWords::BoWToken > token, const uint64_t rel) +void IndexElementIteratorPrivate::storePartsInQueue(boost::shared_ptr< Lima::Common::BagOfWords::BoWToken > token) { +#ifdef DEBUG_CD + BOWLOGINIT; + 
LDEBUG << "IndexElementIteratorPrivate::storePartsInQueue" << token->getIdUTF8String(); +#endif pair, uint64_t> tokenIds; - if (!addPartElementsInQueue(token,tokenIds,rel)) { + if (!addPartElementsInQueue(token,tokenIds,0)) { BOWLOGINIT; - LWARN << "Token contain too many subparts (some are ignored): " - << token->getLemma(); + LWARN << "Token contain too many subparts (some are ignored): " << token->getLemma(); } } @@ -344,40 +412,50 @@ bool IndexElementIteratorPrivate::addPartElementsInQueue(boost::shared_ptr< BoWT pair, uint64_t>& ids_rel, uint64_t rel) { -// BOWLOGINIT; -// LDEBUG << "addPartElementsInQueue:" << token->getLemma() << ", rel=" << rel; +#ifdef DEBUG_CD + BOWLOGINIT; + LDEBUG << "IndexElementIteratorPrivate::addPartElementsInQueue" << token->getIdUTF8String() << rel; +#endif Common::MediaticData::EntityType neType; - - + bool result = false; switch (token->getType()) { - case BOW_TOKEN: - { - // simple token : get Id and push in parts - uint64_t id=m_idGenerator->getId(token->getString()); - ids_rel=make_pair(vector(1,id),rel); - - LimaString lemma=token->getLemma(); - if (lemma.size()==0) { - lemma=token->getInflectedForm(); + case BOW_TOKEN: + { +#ifdef DEBUG_CD + LDEBUG << "IndexElementIteratorPrivate::addPartElementsInQueue simple token:" << token->getIdUTF8String(); +#endif + if (!m_alreadyFoundElements.contains(QString::fromUtf8(token->getIdUTF8String().c_str()))) + { + LimaString lemma=token->getLemma(); + if (lemma.size()==0) { + lemma=token->getInflectedForm(); + } + // simple token : get Id and push in parts + uint64_t id=m_idGenerator->getId(token->getString()); + + m_alreadyFoundElements.insert(QString::fromUtf8(token->getIdUTF8String().c_str()),IndexElement(id, + token->getType(), + lemma, + token->getCategory(), + token->getPosition(), + token->getLength(), + neType)); + result = addInPartQueue(m_alreadyFoundElements[QString::fromUtf8(token->getIdUTF8String().c_str())]); + } else { + result = true; + } + 
ids_rel=make_pair(vector(1,m_alreadyFoundElements[QString::fromUtf8(token->getIdUTF8String().c_str())].getId()),rel); + return result; } - - return addInPartQueue(id, - token->getType(), - lemma, - token->getCategory(), - token->getPosition(), - token->getLength(), - neType); - } - case BOW_NAMEDENTITY: - neType=boost::dynamic_pointer_cast(token)->getNamedEntityType(); - break; - case BOW_TERM: - case BOW_PREDICATE: - case BOW_NOTYPE: - default:; + case BOW_NAMEDENTITY: + neType=boost::dynamic_pointer_cast(token)->getNamedEntityType(); + break; + case BOW_TERM: + case BOW_PREDICATE: + case BOW_NOTYPE: + default:; } // is a complex token @@ -390,31 +468,47 @@ bool IndexElementIteratorPrivate::addPartElementsInQueue(boost::shared_ptr< BoWT return false; } - if (complexToken->size() == 1) { + if (complexToken->size() == 1) { +#ifdef DEBUG_CD + LDEBUG << "IndexElementIteratorPrivate::addPartElementsInQueue complex token of size one"; +#endif // only one part, do not get into it // (for instance, named entity with one element) // push simple token in parts - uint64_t id=m_idGenerator->getId(token->getString()); - ids_rel=make_pair(vector(1,id),rel); + if (!m_alreadyFoundElements.contains(QString::fromUtf8(token->getIdUTF8String().c_str()))) + { + uint64_t id=m_idGenerator->getId(token->getString()); + ids_rel=make_pair(vector(1,id),rel); - LimaString lemma=token->getLemma(); - if (lemma.size()==0) { - lemma=token->getInflectedForm(); - } - return addInPartQueue(id, + LimaString lemma=token->getLemma(); + if (lemma.size()==0) { + lemma=token->getInflectedForm(); + } + m_alreadyFoundElements.insert(QString::fromUtf8(token->getIdUTF8String().c_str()), IndexElement(id, token->getType(), lemma, token->getCategory(), token->getPosition(), token->getLength(), - neType); + neType)); + result = addInPartQueue(m_alreadyFoundElements[QString::fromUtf8(token->getIdUTF8String().c_str())]); + } else { + return result = true; + } + return result; } - - 
ids_rel=make_pair(vector(0),rel); + +#ifdef DEBUG_CD + LDEBUG << "IndexElementIteratorPrivate::addPartElementsInQueue complex token of size" << complexToken->size(); +#endif + ids_rel=make_pair(vector(),rel); uint64_t nbParts=complexToken->getParts().size(); uint64_t head=complexToken->getHead(); vector, uint64_t> > partIdsRels(nbParts); for (uint64_t i=0; i, uint64_t>& thisPartIdsRels=partIdsRels[i]; uint64_t relType; boost::shared_ptr< BoWRelation > relation=(complexToken->getParts()[i]).getBoWRelation(); @@ -422,11 +516,15 @@ bool IndexElementIteratorPrivate::addPartElementsInQueue(boost::shared_ptr< BoWT if (!addPartElementsInQueue(complexToken->getParts()[i].getBoWToken(),thisPartIdsRels,relType)) { return false; } + if (i==head) { // add ids of the head ids_rel.first.insert(ids_rel.first.end(),thisPartIdsRels.first.begin(),thisPartIdsRels.first.end()); } } +#ifdef DEBUG_CD + LDEBUG << "IndexElementIteratorPrivate::addPartElementsInQueue parts added; combining them"; +#endif // add ids for combined parts vector structure; //current structure in recursive function vector relations; //current relations in recursive function @@ -436,19 +534,6 @@ bool IndexElementIteratorPrivate::addPartElementsInQueue(boost::shared_ptr< BoWT return true; } -/** - * this function is recursive to build all composed elements that contains - * the head and all or parts of the extensions, for all possible values (ids) - * of head and extensions - * - * @param partIdsRels : the possible ids of each part, plus one relation per part - * @param head : the position of the head in the parts - * @param ids : the id list in which new ids are added for combined element - * @param structure : the current structure - * @param i : the current part looked at - * - * @return - */ bool IndexElementIteratorPrivate::addCombinedPartsInQueue( const Lima::Common::BagOfWords::BoWType type, const std::vector, uint64_t> >& partIdsRels, @@ -457,55 +542,78 @@ bool 
IndexElementIteratorPrivate::addCombinedPartsInQueue( std::pair, uint64_t>& ids_rel, std::vector& structure, std::vector& relations, - const uint64_t i) + const uint64_t current) { -// BOWLOGINIT; -// if (logger.isDebugEnabled()) { -// ostringstream oss; -// for (vector >::const_iterator it=structure.begin(), -// it_end=structure.end(); it!=it_end; it++) { -// oss << (*it).first << "/" << (*it).second << ";"; -// } -// LDEBUG << "addCombinedPartsInQueue: nb parts=" << partIdsRels.size() -// << ", head=" << head << ", current=" << i << ",structure=" << oss.str(); -// } - - if (i>=partIdsRels.size()) { +#ifdef DEBUG_CD + BOWLOGINIT; +#endif + QStringList structureKey; + for (auto element: structure) { + structureKey << QString::number(element); + } +#ifdef DEBUG_CD + LDEBUG << "addCombinedPartsInQueue: nb parts=" << partIdsRels.size() + << ", head=" << head << ", current=" << current << ", structure=" << structureKey.join(";"); +#endif + bool result = false; + if (current>=partIdsRels.size()) { if (structure.size() == 1) { //just the head: is already in queue +#ifdef DEBUG_CD + LDEBUG << "addCombinedPartsInQueue: just the head: is already in queue"; +#endif return true; } // build indexElement before getting the id : allow to have the // true size of compound (trick: use PositionLengthList to have // the size: number of leaves of the structure), and to avoid // compute the id if size is more than maxCompoundSize - IndexElement compoundElement(0,type,structure,relations,neType); - getPositionLengthList(structure,compoundElement.getPositionLengthList()); - if (compoundElement.getPositionLengthList().size() > m_maxCompoundSize) { - // compound larger than allowed, do not add it in parts, but - // return true anyway (false is reserved for queue size - // overflow) - return true; - } - // at end of parts => add current structure - - uint64_t id=m_idGenerator->getId(structure); -// BOWLOGINIT; -// LDEBUG << "IndexElementIterator: get id from generator " << id; - 
compoundElement.setId(id); - if (!addInPartQueue(compoundElement)) { - return false; + if (!m_alreadyFoundElements.contains(structureKey.join(";"))) + { + IndexElement compoundElement(0,type,structure,relations,neType); + getPositionLengthList(structure,compoundElement.getPositionLengthList()); + if (compoundElement.getPositionLengthList().size() > m_maxCompoundSize) { + // compound larger than allowed, do not add it in parts, but + // return true anyway (false is reserved for queue overflow) +#ifdef DEBUG_CD + LDEBUG << "addCombinedPartsInQueue: just the head: max compound size exceeded"; +#endif + return true; + } + // at end of parts => add current structure + + uint64_t id=m_idGenerator->getId(structure); +#ifdef DEBUG_CD + LDEBUG << "IndexElementIterator: got id from generator " << id; +#endif + compoundElement.setId(id); + m_alreadyFoundElements.insert(structureKey.join(";"),compoundElement); + if (!addInPartQueue(m_alreadyFoundElements[structureKey.join(";")])) { +#ifdef DEBUG_CD + LDEBUG << "addCombinedPartsInQueue: queue overflow"; +#endif + return false; + } else { + result = true; + } + } else { + result = true; } - ids_rel.first.push_back(id); - return true; + ids_rel.first.push_back(m_alreadyFoundElements[structureKey.join(";")].getId()); +#ifdef DEBUG_CD + LDEBUG << "addCombinedPartsInQueue: added to ids_rel.first: " << m_alreadyFoundElements[structureKey.join(";")].getId() << "; return" << result; +#endif + return result; } // add possible at end of structure and recursive call - for (auto it=partIdsRels[i].first.begin(),it_end=partIdsRels[i].first.end(); - it!=it_end; it++) { - structure.push_back(*it); - relations.push_back(partIdsRels[i].second); - if (!addCombinedPartsInQueue(type, partIdsRels,head,neType,ids_rel,structure,relations,i+1)) { + for (auto it:partIdsRels[current].first) { + structure.push_back(it); + relations.push_back(partIdsRels[current].second); + if (!addCombinedPartsInQueue(type, 
partIdsRels,head,neType,ids_rel,structure,relations,current+1)) { +#ifdef DEBUG_CD + LDEBUG << "addCombinedPartsInQueue: recursive call returned false"; +#endif return false; } structure.pop_back(); @@ -514,8 +622,11 @@ bool IndexElementIteratorPrivate::addCombinedPartsInQueue( // if head, stop here: current iterator is head, hence always added // otherwise, recursive call without current iterator (that is an // extension) - if (i!=head) { - if (!addCombinedPartsInQueue(type, partIdsRels,head,neType,ids_rel,structure,relations,i+1)) { + if (current!=head) { + if (!addCombinedPartsInQueue(type, partIdsRels,head,neType,ids_rel,structure,relations,current+1)) { +#ifdef DEBUG_CD + LDEBUG << "addCombinedPartsInQueue: second recursive call returned false"; +#endif return false; } } From 8c69c23905158eadb97860898fdfee9366e3b964 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABl=20de=20Chalendar?= Date: Tue, 22 Mar 2016 11:10:13 +0100 Subject: [PATCH 26/82] Correct lemma of pronouns in fre dictionary Solves issue #39 --- .../fre/lefff/lefff-ext-lima.dic | 212 +++++++++--------- 1 file changed, 106 insertions(+), 106 deletions(-) diff --git a/lima_linguisticdata/analysisDictionary/fre/lefff/lefff-ext-lima.dic b/lima_linguisticdata/analysisDictionary/fre/lefff/lefff-ext-lima.dic index 853bc4762..62b87e12e 100644 --- a/lima_linguisticdata/analysisDictionary/fre/lefff/lefff-ext-lima.dic +++ b/lima_linguisticdata/analysisDictionary/fre/lefff/lefff-ext-lima.dic @@ -38,46 +38,46 @@ $ mâle Afha--- -> -> Ff -> -> Ff -ce ce Ppcsa---- --elle cln Ppcsa3-fs --elles cln Ppcsa3-fp --en clg Ppcda1-ms --en cll Ppcda1-ms --il cln Ppcsa3-ms +-elle elle Ppcsa3-fs +-elles elle Ppcsa3-fp +-en en Ppcda1-ms +-en en Ppcda1-ms +-il il Ppcsa3-ms -il ilimp Ppcsa---- --ils cln Ppcsa3-mp --je cln Ppcsa1-ms --la cla Ppcda3-fs --le cla Ppcda3-ms --les cla Ppcda3-mp --leur cld Ppcda3-mp --lui cld Ppcda3-ms --m' cld Ppcda1-ms --moi cla Ppcda1-ms --moi cld Ppcda1-ms --nous cla Ppcda1-mp --nous cld Ppcda1-mp 
--nous cln Ppcsa1-mp --on cln Ppcsa3-ms --t' cld Ppcda2-ms --t-elle cln Ppcsa3-fs --t-elles cln Ppcsa3-fp --t-en cll Ppcda1-ms --t-il cln Ppcsa3-ms --t-il ilimp Ppcsa---- --t-ils cln Ppcsa3-mp --t-on cln Ppcsa3-ms --t-y cld Ppcda1-ms --toi cla Ppcda2-ms --toi cld Ppcda2-ms --tu cln Ppcsa2-ms --vous cla Ppcda2-mp --vous cld Ppcda2-mp --vous cln Ppcsa2-mp --vs cla Ppcda2-mp --vs cld Ppcda2-mp --vs cln Ppcsa2-mp --y cld Ppcda1-ms --y cll Ppcda1-ms +-ils ils Ppcsa3-mp +-je je Ppcsa1-ms +-la la Ppcda3-fs +-le le Ppcda3-ms +-les les Ppcda3-mp +-leur leur Ppcda3-mp +-lui lui Ppcda3-ms +-m' me Ppcda1-ms +-moi moi Ppcda1-ms +-moi moi Ppcda1-ms +-nous nous Ppcda1-mp +-nous nous Ppcda1-mp +-nous nous Ppcsa1-mp +-on on Ppcsa3-ms +-t' te Ppcda2-ms +-t-elle elle Ppcsa3-fs +-t-elles elle Ppcsa3-fp +-t-en en Ppcda1-ms +-t-il il Ppcsa3-ms +-t-il il Ppcsa---- +-t-ils il Ppcsa3-mp +-t-on on Ppcsa3-ms +-t-y y Ppcda1-ms +-toi toi Ppcda2-ms +-toi toi Ppcda2-ms +-tu tu Ppcsa2-ms +-vous vous Ppcda2-mp +-vous vous Ppcda2-mp +-vous vous Ppcsa2-mp +-vs vous Ppcda2-mp +-vs vous Ppcda2-mp +-vs vous Ppcsa2-mp +-y y Ppcda1-ms +-y y Ppcda1-ms . . Ff ... ... Fa / ou Cc @@ -135127,7 +135127,7 @@ cetteputainde ceputainde Dd--fs- ceux celui Pd-----mp ceux-ci celui-ci Pd-----mp ceux-là celui-là Pd-----mp -ch' cln Ppcsa1-ms +ch' je Ppcsa1-ms ch'ti petit Afha-ms ch. 
chapitre Ncgf-- cha-cha-cha cha-cha-cha Ncgm-- @@ -253165,8 +253165,8 @@ eldorado eldorado Ncgms- eldorados eldorado Ncgmp- elfe elfe Ncgms- elfes elfe Ncgmp- -elle cln Ppcsa3-fs -elles cln Ppcsa3-fp +elle elle Ppcsa3-fs +elles elle Ppcsa3-fp ellipse ellipse Ncgfs- ellipses ellipse Ncgfp- ellipsoïdal ellipsoïdal Afha-ms @@ -260142,8 +260142,8 @@ empêtrées empêtré Afha-fp empêtrés empêtrer Vppsi-mp empêtrés empêtrer Vppsp-mp empêtrés empêtré Afha-mp -en clg Ppcda1-ms -en cll Ppcda1-ms +en en Ppcda1-ms +en en Ppcda1-ms en en Sg en-cours en-cours Ncgm-- en-deçà en-deçà Rg @@ -336718,7 +336718,7 @@ ijawe ijaw Afha-fs ijawes ijaw Afha-fp ijaws ijaw Afha-mp ijaws ijaw Ncgmp- -il cln Ppcsa3-ms +il il Ppcsa3-ms il ilimp Ppcsa---- ilang ilang Ncgms- ilang-ilang ilang-ilang Ncgms- @@ -337131,7 +337131,7 @@ illégitimité illégitimité Ncgfs- illégitimités illégitimité Ncgfp- ilote ilote Ncgms- ilotes ilote Ncgmp- -ils cln Ppcsa3-mp +ils ils Ppcsa3-mp ilya ilya Sg iléal iléal Afha-ms iléale iléal Afha-fs @@ -356566,7 +356566,7 @@ ixia ixia Ncgfs- ixias ixia Ncgfp- ixode ixode Ncgms- ixodes ixode Ncgmp- -j' cln Ppcsa1-ms +j' je Ppcsa1-ms jabiru jabiru Ncgms- jabirus jabiru Ncgmp- jabla jabler Vpisi3-s @@ -358158,7 +358158,7 @@ jdanoviennes jdanovien Afha-fp jdanoviens jdanovien Afha-mp jdanovo-maoïste jdanovo-maoïste Afha--- jdanovo-maoïstes jdanovo-maoïste Afha--- -je cln Ppcsa1-ms +je je Ppcsa1-ms je-m'en-fichisme je-m'en-fichisme Ncgms- je-m'en-fichismes je-m'en-fichisme Ncgmp- je-m'en-fichiste je-m'en-fichiste Afha--- @@ -361520,14 +361520,14 @@ kérogène kérogène Ncgms- kérogènes kérogène Ncgmp- kérosène kérosène Ncgms- kérosènes kérosène Ncgmp- -l' cla Ppcda3-fs -l' cla Ppcda3-ms +l' la Ppcda3-fs +l' le Ppcda3-ms l' le Da--msd l'autre l'autre Pi-----ms -l'on cln Ppcsa3-ms +l'on on Ppcsa3-ms l'un l'un Pi-----ms l'une l'un Pi-----fs -la cla Ppcda3-fs +la la Ppcda3-fs la le Da--fsd laVarenne-Saint-Hilaire La-Varenne-Saint-Hilaire Npgfs- laVarenne-St-Hilaire 
La-Varenne-Saint-Hilaire Npgfs- @@ -365083,7 +365083,7 @@ laïussé laïusser Vppsi-ms laïussée laïusser Vppsi-fs laïussées laïusser Vppsi-fp laïussés laïusser Vppsi-mp -le cla Ppcda3-ms +le la Ppcda3-ms le le Da--msd le__det le Da--msd leader leader Ncgms- @@ -365261,7 +365261,7 @@ lequel lequel Pr-n---ms lequel lequel Pt-d---ms lequel lequel Pt-n---ms lerche lerche Rg -les cla Ppcda3-mp +les le Ppcda3-mp les le Da--mpd les__det le Da--mpd lesautres l'autre Pi-----mp @@ -365566,7 +365566,7 @@ leucémique leucémique Ncgms- leucémiques leucémique Ncgmp- leude leude Ncgms- leudes leude Ncgmp- -leur cld Ppcda3-mp +leur sien Ppcda3-mp leur son Ds3pms- leurra leurrer Vpisi3-s leurra leurrer Vpisp3-s @@ -370716,10 +370716,10 @@ lugé luger Vppsi-ms lugée luger Vppsi-fs lugées luger Vppsi-fp lugés luger Vppsi-mp -lui cld Ppcda3-ms -lui-même cln Ppcsa3-ms -lui-même cln Px---1-ms -lui-même cln Px---3-ms +lui lui Ppcda3-ms +lui-même lui-même Ppcsa3-ms +lui-même lui-même Px---1-ms +lui-même lui-même Px---3-ms luira luire Vpifi3-s luirai luire Vpifi1-s luiraient luire Vpici3-p @@ -372751,9 +372751,9 @@ lût lire Vpsii3-s lût lire Vpsip3-s lûtes lire Vpisi2-p lûtes lire Vpisp2-p -m' cla Ppcda1-ms -m' cld Ppcda1-ms -m' clr Px---1-ms +m' m' Ppcda1-ms +m' m' Ppcda1-ms +m' m' Px---1-ms m'as-tu-vu m'as-tu-vu Ncgm-- m'as-tu-vue m'as-tu-vu Ncgf-- ma son Ds1sfs- @@ -381459,9 +381459,9 @@ maïserie maïserie Ncgfs- maïseries maïserie Ncgfp- maïzena maïzena Ncgfs- maïzenas maïzena Ncgfp- -me cla Ppcda1-ms -me cld Ppcda1-ms -me clr Px---1-ms +me me Ppcda1-ms +me me Ppcda1-ms +me me Px---1-ms mea-culpa mea-culpa Ncgm-- meaculpa meaculpa Ncgm-- meau meau Ncgmp- @@ -386921,8 +386921,8 @@ mogols mogol Afha-mp mogols mogol Ncgmp- mohair mohair Ncgms- mohairs mohair Ncgmp- -moi cla Ppcda1-ms -moi cld Ppcda1-ms +moi moi Ppcda1-ms +moi moi Ppcda1-ms moi moi Ncgms- moie moie Ncgfs- moies moie Ncgfp- @@ -402507,10 +402507,10 @@ nourrît nourrir Vpsii3-s nourrît nourrir Vpsip3-s nourrîtes nourrir Vpisi2-p 
nourrîtes nourrir Vpisp2-p -nous cla Ppcda1-mp -nous cld Ppcda1-mp -nous cln Ppcsa1-mp -nous clr Px---1-mp +nous nous Ppcda1-mp +nous nous Ppcda1-mp +nous nous Ppcsa1-mp +nous nous Px---1-mp nouure nouure Ncgfs- nouures nouure Ncgfp- nouveau nouveau Afha-ms @@ -408190,7 +408190,7 @@ omît omettre Vpsii3-s omît omettre Vpsip3-s omîtes omettre Vpisi2-p omîtes omettre Vpisp2-p -on cln Ppcsa3-ms +on on Ppcsa3-ms on-dit on-dit Ncgm-- on-line on-line Afha--- on-lines on-line Afha--- @@ -527525,12 +527525,12 @@ rôtîtes rôtir Vpisi2-p rôtîtes rôtir Vpisp2-p röntgen röntgen Ncgms- röntgens röntgen Ncgmp- -s' clar Px---3-mp -s' clar Px---3-ms -s' cldr Px---3-mp -s' cldr Px---3-ms -s' clr Px---3-mp -s' clr Px---3-ms +s' se Px---3-mp +s' se Px---3-ms +s' se Px---3-mp +s' se Px---3-ms +s' se Px---3-mp +s' se Px---3-ms s'agissant s'agissant Sg s'ilteplaît s'ilteplaît Rg s'ilvousplaît s'ilvousplaît Rg @@ -535304,12 +535304,12 @@ scénographique scénographique Afha--- scénographiques scénographique Afha--- scénologie scénologie Ncgfs- scénologies scénologie Ncgfp- -se clar Px---3-mp -se clar Px---3-ms -se cldr Px---3-mp -se cldr Px---3-ms -se clr Px---3-mp -se clr Px---3-ms +se se Px---3-mp +se se Px---3-ms +se se Px---3-mp +se se Px---3-ms +se se Px---3-mp +se se Px---3-ms seau seau Ncgms- sebka sebka Ncgfs- sebkas sebka Ncgfp- @@ -563720,10 +563720,10 @@ sût savoir Vpsii3-s sût savoir Vpsip3-s sûtes savoir Vpisi2-p sûtes savoir Vpisp2-p -t' cla Ppcda2-ms -t' cld Ppcda2-ms -t' cln Ppcsa2-ms -t' clr Px---2-ms +t' tu Ppcda2-ms +t' tu Ppcda2-ms +t' tu Ppcsa2-ms +t' tu Px---2-ms t-shirt t-shirt Ncgms- t-shirts t-shirt Ncgmp- t. 
tome Ncgm-- @@ -567997,9 +567997,9 @@ tchétchène tchétchène Afha--- tchétchène tchétchène Ncgms- tchétchènes tchétchène Afha--- tchétchènes tchétchène Ncgmp- -te cla Ppcda2-ms -te cld Ppcda2-ms -te clr Px---2-ms +te te Ppcda2-ms +te te Ppcda2-ms +te te Px---2-ms technicien technicien Ncgms- technicienne technicien Ncgfs- techniciennes technicien Ncgfp- @@ -573000,8 +573000,8 @@ togolaises togolais Afha-fp togolaises togolais Ncgfp- togolo-_ togolo-_1 Ep tohu-bohu tohu-bohu Ncgm-- -toi cla Ppcda2-ms -toi cld Ppcda2-ms +toi toi Ppcda2-ms +toi toi Ppcda2-ms toi toi Ncgms- toilage toilage Ncgms- toilages toilage Ncgmp- @@ -585387,7 +585387,7 @@ tsé-tsé tsé-tsé Ncgf-- tt tout Afha-ms tte tout Afha-fs ttes tout Afha-fp -tu cln Ppcsa2-ms +tu tu Ppcsa2-ms tu taire Vppsi-ms tu taire Vppsm-ms tu taire Vppsp-ms @@ -602776,10 +602776,10 @@ vouons vouer Vpipp1-p vouons vouer Vpmpp1-p vouons vouer Vpipt1-p vouons vouer Vpmpt1-p -vous cla Ppcda2-mp -vous cld Ppcda2-mp -vous cln Ppcsa2-mp -vous clr Px---2-mp +vous vous Ppcda2-mp +vous vous Ppcda2-mp +vous vous Ppcsa2-mp +vous vous Px---2-mp vousoie vousoyer Vpipt3-s vousoie vousoyer Vpmpt2-s vousoiement vousoiement Ncgms- @@ -603432,12 +603432,12 @@ vrombîmes vrombir Vpisi1-p vrombît vrombir Vpsii3-s vrombîtes vrombir Vpisi2-p vroum vroum I -vs cla Ppcda2-mp -vs cld Ppcda2-mp -vs cln Ppcsa2-mp -vs clr Px---2-mp -vs vs Sg -vs. vs Sg +vs vous Ppcda2-mp +vs vous Ppcda2-mp +vs vous Ppcsa2-mp +vs vous Px---2-mp +vs versus Sg +vs. 
versus Sg vu voir Vppsi-ms vu voir Vppsm-ms vu voir Vppsp-ms @@ -605098,8 +605098,8 @@ xérophytique xérophytique Afha--- xérophytiques xérophytique Afha--- xérus xérus Ncgm-- xérès xérès Ncgm-- -y cld Ppcda1-ms -y cll Ppcda1-ms +y y Ppcda1-ms +y y Ppcda1-ms yacht yacht Ncgms- yacht-club yacht-club Ncgms- yacht-clubs yacht-club Ncgmp- From 2ed4deb5348039fd2eb4d5c20b3dc44e32e40693 Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Tue, 12 Apr 2016 14:36:11 +0200 Subject: [PATCH 27/82] Better config for SRL --- lima_linguisticprocessing/conf/lima-lp-eng.xml | 8 +++++--- .../core/AnalysisDumpers/ConllDumper.cpp | 6 +++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/lima_linguisticprocessing/conf/lima-lp-eng.xml b/lima_linguisticprocessing/conf/lima-lp-eng.xml index 1ff773587..6c46f1503 100644 --- a/lima_linguisticprocessing/conf/lima-lp-eng.xml +++ b/lima_linguisticprocessing/conf/lima-lp-eng.xml @@ -704,7 +704,7 @@ - + @@ -780,10 +780,12 @@ - + + + + - diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/ConllDumper.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/ConllDumper.cpp index 3170c8b3a..aacae883f 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/ConllDumper.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/ConllDumper.cpp @@ -184,10 +184,10 @@ void ConllDumper::init(Common::XMLConfigurationFiles::GroupConfigurationStructur } while (ifs.good() && !ifs.eof()) { - std::string line; - while(getline(ifs, line)) // as long as we can put the line on "line" + std::string line = Lima::Common::Misc::readLine(ifs); + QStringList strs = QString::fromUtf8(line.c_str()).split('\t'); + if (strs.size() == 2) { - QStringList strs = QString::fromUtf8(line.c_str()).split('\t'); m_d->m_conllLimaDepMapping.insert(strs[0],strs[1]); } } From a8a386f1e336ba3cd90305a273a4b15b2a79aa7a Mon Sep 17 00:00:00 2001 From: Gael de Chalendar 
Date: Tue, 12 Apr 2016 15:27:25 +0200 Subject: [PATCH 28/82] Change the number of cols in SRL output Set this number as a define to ease up its later change. --- .../SemanticRoleLabelingLoader.cpp | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRoleLabelingLoader.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRoleLabelingLoader.cpp index 736bfc0fd..4a933dfa3 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRoleLabelingLoader.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRoleLabelingLoader.cpp @@ -64,6 +64,9 @@ namespace SemanticAnalysis { SimpleFactory SemanticRoleLabelingFactory(SEMANTICROLELABELINGLOADER_CLASSID); +#define NBCOLSINSRLBEFOREFRAME 10 + + // Conll handler struct ConllHandler { @@ -266,11 +269,11 @@ bool ConllHandler::extractSemanticInformation(int sentenceI, LimaConllTokenIdMap QStringList sentenceTokens=cHandler.splitSegment(sent, m_tokenSeparator); QString firstSentenceToken=(*sentenceTokens.constBegin()); int descriptorsNb = cHandler.splitSegment(firstSentenceToken, m_descriptorSeparator).size(); - m_verbalClassNb = descriptorsNb -11; + m_verbalClassNb = descriptorsNb -NBCOLSINSRLBEFOREFRAME; int classIndex=0; if (m_verbalClassNb > 0) { - LDEBUG << "ConllHandler::extractSemanticInformation" << sentenceI << " : \n" << sent ; + LDEBUG << "ConllHandler::extractSemanticInformation" << m_verbalClassNb << sentenceI << " : \n" << sent ; m_verbalClasses.clear(); m_verbalClasses.resize(m_verbalClassNb); m_semanticRoles.clear(); @@ -281,14 +284,14 @@ bool ConllHandler::extractSemanticInformation(int sentenceI, LimaConllTokenIdMap { int roleNumbers=0; QStringList descriptors=cHandler.splitSegment((*tokensIterator),m_descriptorSeparator); - if (descriptors.size()>=11+m_verbalClassNb) + if 
(descriptors.size()>=NBCOLSINSRLBEFOREFRAME+m_verbalClassNb) { int conllTokenId=descriptors[0].toInt(); QString conllToken=descriptors[1]; - if(descriptors[10]!="-") + if(descriptors[NBCOLSINSRLBEFOREFRAME]!="_") { - QString verbalClass=descriptors[10]; - QString vClass=descriptors[10]; + QString verbalClass=descriptors[NBCOLSINSRLBEFOREFRAME]; + QString vClass=descriptors[NBCOLSINSRLBEFOREFRAME]; LinguisticGraphVertex limaTokenId=cHandler.getLimaTokenId(conllTokenId, sentenceI, limaConllMapping); if (classIndex >= m_verbalClasses.size()) { @@ -301,14 +304,14 @@ bool ConllHandler::extractSemanticInformation(int sentenceI, LimaConllTokenIdMap for (int roleTargetFieldIndex=0; roleTargetFieldIndex= descriptors.size()) + if (NBCOLSINSRLBEFOREFRAME+roleTargetFieldIndex >= descriptors.size()) { LERROR << "ConllHandler::extractSemanticInformation roleTargetFieldIndex error" << roleTargetFieldIndex; break; } - if (descriptors[11+roleTargetFieldIndex]!="-") + if (descriptors[NBCOLSINSRLBEFOREFRAME+roleTargetFieldIndex]!="_") { - QString semanticRoleLabel=descriptors[11+roleTargetFieldIndex]; + QString semanticRoleLabel=descriptors[NBCOLSINSRLBEFOREFRAME+roleTargetFieldIndex]; LinguisticGraphVertex limaTokenId=cHandler.getLimaTokenId(conllTokenId, sentenceI, limaConllMapping); if(limaTokenId!=0) From 133dd67910b817c0be2e94f744b73454ddad7a86 Mon Sep 17 00:00:00 2001 From: Simon Marchal Date: Tue, 17 May 2016 11:10:48 +0200 Subject: [PATCH 29/82] Fix position/length calculation --- .../BagOfWords/indexElementIterator.cpp | 27 ++++++++++--------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/indexElementIterator.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/indexElementIterator.cpp index ad5d0d45e..9597eddfb 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/indexElementIterator.cpp +++ 
b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/indexElementIterator.cpp @@ -347,20 +347,23 @@ void IndexElementIteratorPrivate::getPositionLengthList(const std::vector::const_iterator - it=structure.begin(),it_end=structure.end(); - for (std::deque::const_iterator - elt=m_partQueue.begin(),elt_end=m_partQueue.end(); - elt!=elt_end; elt++) { - if ((*elt).getId()==*it) { - const PositionLengthList& p=(*elt).getPositionLengthList(); - poslenlist.insert(poslenlist.end(),p.begin(),p.end()); - it++; - if (it==it_end) { - break; - } + for (std::vector::const_iterator it = structure.begin(); it != structure.end(); ++it) { + + QMap::const_iterator found = m_alreadyFoundElements.begin(); + while (found != m_alreadyFoundElements.end() && *it != found.value().getId()) { + ++found; + } + + if (found != m_alreadyFoundElements.end()) { + const PositionLengthList& p = found.value().getPositionLengthList(); + poslenlist.insert(poslenlist.end(), p.begin(), p.end()); + } + else { + BOWLOGINIT + LERROR << "getPositionLengthList failure: element id " << *it << " not found"; } } + // sort positions std::sort(poslenlist.begin(),poslenlist.end()); } From 234a31ccd91290f5c13cc567630219d404e40693 Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Tue, 24 May 2016 12:00:22 +0200 Subject: [PATCH 30/82] Add aux to the list of syntactic relations --- lima_common/conf/lima-common-fre.xml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lima_common/conf/lima-common-fre.xml b/lima_common/conf/lima-common-fre.xml index 9a3a667a8..aef8d8918 100644 --- a/lima_common/conf/lima-common-fre.xml +++ b/lima_common/conf/lima-common-fre.xml @@ -180,6 +180,10 @@ + + + + From 817ed654ad0df11643790c498c11fe312bdcdb7c Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Tue, 24 May 2016 12:00:54 +0200 Subject: [PATCH 31/82] Revert the change from TEMPCOMP to aux This change should have been limited to master --- .../core/SyntacticAnalysis/HomoSyntagmaticConstraints.cpp | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/HomoSyntagmaticConstraints.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/HomoSyntagmaticConstraints.cpp index c645fc5da..06746e98f 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/HomoSyntagmaticConstraints.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/HomoSyntagmaticConstraints.cpp @@ -732,7 +732,7 @@ CreateCompoundTense::CreateCompoundTense(MediaId language, size_t secondSepPos = str.find_first_of(';', firstSepPos+1); m_micro=static_cast(Common::MediaticData::MediaticData::single().mediaData(language)).getPropertyCodeManager().getPropertyManager("MICRO").getPropertyValue(str.substr(firstSepPos + 1, secondSepPos - firstSepPos - 1)); - m_tempCompType=static_cast(Common::MediaticData::MediaticData::single().mediaData(language)).getSyntacticRelationId("aux"); + m_tempCompType=static_cast(Common::MediaticData::MediaticData::single().mediaData(language)).getSyntacticRelationId("TEMPCOMP"); #ifdef DEBUG_LP LDEBUG << "CreateCompoundTense::CreateCompoundTense() m_tempCompType" << m_tempCompType; #endif From 128162447c9598bc32bed7a0d930ccb479f4f903 Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Mon, 11 Apr 2016 13:56:44 +0200 Subject: [PATCH 32/82] Set the numeric status at first digit --- .../fre/tokenizerAutomaton-fre.tok | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/lima_linguisticdata/scratch/LinguisticProcessings/fre/tokenizerAutomaton-fre.tok b/lima_linguisticdata/scratch/LinguisticProcessings/fre/tokenizerAutomaton-fre.tok index c4770b8e2..a9ad88f07 100644 --- a/lima_linguisticdata/scratch/LinguisticProcessings/fre/tokenizerAutomaton-fre.tok +++ b/lima_linguisticdata/scratch/LinguisticProcessings/fre/tokenizerAutomaton-fre.tok @@ -25,7 +25,6 @@ - c_quote c_b c_del1 > ALPHA - c_quote 
c_del1 > ALPHA - unknwn > ALPHA - - c_5 > ALPHANUMERIC (T_ALPHANUMERIC) - c_quote c_del = START (T_WORD_BRK) - c_quote = START - c_del1|c_comma|c_slash|c_hyphen|c_quote|c_percent|c_fraction|m_line > DELIMITER (T_WORD_BRK) @@ -39,7 +38,6 @@ - c_V > CARDINAL_ROMAN (T_NUMERIC,T_CARDINAL_ROMAN) - c_M > SINGLE_UPPER (T_CAPITAL,T_ALPHA) - c_m > ALL_LOWER (T_ALPHA,T_SMALL) - - c_5 > ALPHANUMERIC - c_hyphen|c_plus c_5 > INTEGER (T_NUMERIC,T_INTEGER) - m_pattern > PATTERN (T_PATTERN) - c_lowline > START @@ -107,7 +105,7 @@ - m_eof = END - c_dot c_dot c_dot = SUSP1 (T_SENTENCE_BRK) - c_del1|c_comma|c_slash|c_hyphen|c_quote|c_percent|c_fraction|c_dot|m_line = DELIMITER (T_WORD_BRK) - - c_5 = INTEGER + - c_5 = INTEGER (T_NUMERIC,T_INTEGER) - c_par = IGNORE (T_SENTENCE_BRK) - c_b = IGNORE - c_V = CARDINAL_ROMAN (T_NUMERIC,T_CARDINAL_ROMAN) @@ -128,7 +126,6 @@ - c_quote c_b c_del1 = ALPHA - c_quote c_del1 = ALPHA - unknwn = ALPHA - - c_5 = ALPHANUMERIC (T_ALPHANUMERIC) - c_quote c_del = START (T_WORD_BRK) - c_quote = START - c_all = START @@ -139,7 +136,7 @@ - c_hyphen c_a_t c_quote = TEUPHOT (T_ALPHA) - c_M > ALL_UPPER (T_CAPITAL) - c_m > LOWER_1ST_UPPER - - c_5 > ALPHANUMERIC + - c_5 > ALPHANUMERIC (T_ALPHANUMERIC) - c_dot c_Mm c_dot > ACRONYM_1 (T_ACRONYM) - c_dot c_b > ABBREV (T_ABBREV) - c_b = IGNORE @@ -156,7 +153,7 @@ - c_hyphen c_a_t c_hyphen = TEUPHOT (T_ALPHA) - c_hyphen c_a_t c_quote = TEUPHOT (T_ALPHA) - c_M > ALL_UPPER - - c_5 > ALPHANUMERIC + - c_5 > ALPHANUMERIC (T_ALPHANUMERIC) - c_m > LOWER_UPPER (T_CAPITAL_SMALL) - c_b = IGNORE - c_hyphen c_M > ALL_UPPER (T_HYPHEN_WORD) @@ -174,7 +171,7 @@ - c_quote c_Mm > APOS - c_m > ALL_LOWER - c_M > LOWER_UPPER (T_CAPITAL_SMALL) - - c_5 > ALPHANUMERIC + - c_5 > ALPHANUMERIC (T_ALPHANUMERIC) - c_b = IGNORE - c_hyphen c_M > LOWER_UPPER (T_HYPHEN_WORD) - c_hyphen c_m > ALL_LOWER (T_HYPHEN_WORD) @@ -193,7 +190,7 @@ - c_hyphen c_a_t c_quote = TEUPHOT (T_ALPHA) - c_m > LOWER_1ST_UPPER - c_M > LOWER_UPPER (T_CAPITAL_SMALL) - - c_5 
> ALPHANUMERIC + - c_5 > ALPHANUMERIC (T_ALPHANUMERIC) - c_quote c_Mm > APOS - c_hyphen c_M > LOWER_UPPER (T_HYPHEN_WORD,T_CAPITAL_SMALL) - c_hyphen c_m > LOWER_1ST_UPPER (T_HYPHEN_WORD,T_CAPITAL_SMALL) @@ -210,7 +207,7 @@ - c_hyphen c_a_t c_quote = TEUPHOT (T_ALPHA) - c_Mm > LOWER_UPPER - c_b = IGNORE - - c_5 > ALPHANUMERIC + - c_5 > ALPHANUMERIC (T_ALPHANUMERIC) - c_hyphen c_Mm > LOWER_UPPER (T_HYPHEN_WORD,T_CAPITAL_SMALL) - c_del1|c_comma|c_slash|c_hyphen|c_quote|c_percent|c_fraction|m_line = DELIMITER (T_WORD_BRK) - c_del2|c_dot = DELIMITER (T_SENTENCE_BRK) @@ -251,7 +248,7 @@ - c_l_eg c_l_m c_l_e | c_l_e c_l_m c_l_e > ORDINAL_ROMAN2 (T_NUMERIC,T_ORDINAL_ROMAN) - c_l_n c_l_d | c_l_e c_l_r > ORDINAL_ROMAN1 (T_NUMERIC,T_ORDINAL_ROMAN) - c_l_eg > ORDINAL_ROMAN (T_NUMERIC,T_ORDINAL_ROMAN) - - c_5 > ALPHANUMERIC + - c_5 > ALPHANUMERIC (T_ALPHANUMERIC) - c_dot c_b|c_M > SINGLE_UPPER (T_NOT_ROMAN,T_ALPHA,T_CAPITAL_1ST) - [c_del|m_line|m_eof] [c_M] c_m > LOWER_1ST_UPPER (T_NOT_ROMAN,T_ALPHA,T_CAPITAL_1ST) - [c_del|m_line|m_eof] [c_M] c_M > ALL_UPPER (T_NOT_ROMAN,T_ALPHA,T_CAPITAL) From 2cff9735d12348cc11251b3fbddd9adfb98d14d3 Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Mon, 11 Apr 2016 14:36:09 +0200 Subject: [PATCH 33/82] Correct tokenization and norm. 
of negative numbers --- .../fre/tokenizerAutomaton-fre.tok | 2 +- .../data/test-fre.default.xml | 65 +++++++++++++++++++ 2 files changed, 66 insertions(+), 1 deletion(-) diff --git a/lima_linguisticdata/scratch/LinguisticProcessings/fre/tokenizerAutomaton-fre.tok b/lima_linguisticdata/scratch/LinguisticProcessings/fre/tokenizerAutomaton-fre.tok index a9ad88f07..06b98cd19 100644 --- a/lima_linguisticdata/scratch/LinguisticProcessings/fre/tokenizerAutomaton-fre.tok +++ b/lima_linguisticdata/scratch/LinguisticProcessings/fre/tokenizerAutomaton-fre.tok @@ -79,7 +79,7 @@ - c_M / SINGLE_UPPER (T_ALPHA,T_CAPITAL_1ST) - c_m / ALL_LOWER (T_ALPHA,T_SMALL) - c_5 / INTEGER (T_NUMERIC,T_INTEGER) - - c_hyphen|c_plus c_5 / INTEGER (T_NUMERIC) + - c_hyphen|c_plus c_5 / INTEGER (T_NUMERIC,T_INTEGER) - m_pattern m_pattern / PATTERN (T_PATTERN) - c_lowline / START - c_other / START diff --git a/lima_linguisticprocessing/data/test-fre.default.xml b/lima_linguisticprocessing/data/test-fre.default.xml index f1538722e..882f2d1ac 100644 --- a/lima_linguisticprocessing/data/test-fre.default.xml +++ b/lima_linguisticprocessing/data/test-fre.default.xml @@ -176,4 +176,69 @@ operator="contains" right="ADJ"/> + + + + + + + + + euritrack est un mot inconnu tout en minuscules, doit être normalisé 'euritrack' + + + + 'euri100' is an unknown word starting by lowercase letters and ending by digits ; it must be normalized in itself 'euri100'. + + + + + + + + + + + The number in digits 27 must be normalized into itself. + + + + + + + + + + + The negative numbers in digits -27 and -25 must be normalized into themselves. + + + + + + + + + + From 8e5ef72db0c56cdb9d4d5a144641589c844dd5bc Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Mon, 11 Apr 2016 14:37:23 +0200 Subject: [PATCH 34/82] Remove dico entries for numbers in digits Freeling contained dictionary entries giving "two" as lemma of "2" (etc.). This is not desirable in LIMA as everything concerning numbers is handled in later steps. 
--- .../eng/freeling/numbers.dic | 102 ------------------ 1 file changed, 102 deletions(-) diff --git a/lima_linguisticdata/analysisDictionary/eng/freeling/numbers.dic b/lima_linguisticdata/analysisDictionary/eng/freeling/numbers.dic index 9a0317ebb..5e8716016 100644 --- a/lima_linguisticdata/analysisDictionary/eng/freeling/numbers.dic +++ b/lima_linguisticdata/analysisDictionary/eng/freeling/numbers.dic @@ -98,105 +98,3 @@ twenty-six twenty-six CD twenty-three twenty-three CD twenty-two twenty-two CD two two CD - -8 eight CD -18 eighteen CD -80 eighty CD -88 eighty-eight CD -85 eighty-five CD -84 eighty-four CD -89 eighty-nine CD -81 eighty-one CD -87 eighty-seven CD -86 eighty-six CD -83 eighty-three CD -82 eighty-two CD -7 eleven CD -15 fifteen CD -50 fifty CD -58 fifty-eight CD -55 fifty-five CD -54 fifty-four CD -59 fifty-nine CD -51 fifty-one CD -57 fifty-seven CD -56 fifty-six CD -53 fifty-three CD -52 fifty-two CD -5 five CD -40 forty CD -48 forty-eight CD -45 forty-five CD -44 forty-four CD -49 forty-nine CD -41 forty-one CD -47 forty-seven CD -46 forty-six CD -43 forty-three CD -42 forty-two CD -4 four CD -14 fourteen CD -9 nine CD -19 nineteen CD -90 ninety CD -98 ninety-eight CD -95 ninety-five CD -94 ninety-four CD -99 ninety-nine CD -91 ninety-one CD -97 ninety-seven CD -96 ninety-six CD -93 ninety-three CD -92 ninety-two CD -1 one CD -1,000 one-hundred CD -1000 one-hundred CD -7 seven CD -17 seventeen CD -70 seventy CD -78 seventy-eight CD -75 seventy-five CD -74 seventy-four CD -79 seventy-nine CD -71 seventy-one CD -77 seventy-seven CD -76 seventy-six CD -73 seventy-three CD -72 seventy-two CD -6 six CD -16 sixteen CD -60 sixty CD -68 sixty-eight CD -65 sixty-five CD -64 sixty-four CD -69 sixty-nine CD -61 sixty-one CD -67 sixty-seven CD -66 sixty-six CD -63 sixty-three CD -62 sixty-two CD -10 ten CD -13 thirteen CD -30 thirty CD -38 thirty-eight CD -35 thirty-five CD -34 thirty-four CD -39 thirty-nine CD -31 thirty-one CD -37 thirty-seven CD -36 
thirty-six CD -33 thirty-three CD -32 thirty-two CD -3 three CD -12 twelve CD -20 twenty CD -28 twenty-eight CD -25 twenty-five CD -24 twenty-four CD -29 twenty-nine CD -21 twenty-one CD -27 twenty-seven CD -26 twenty-six CD -23 twenty-three CD -22 twenty-two CD -2 two CD From 647e6cd4d551729d07fa39a77dfafd8def7b1be9 Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Mon, 11 Apr 2016 14:41:14 +0200 Subject: [PATCH 35/82] Correct default token status for digit numbers Add a tva test to verify it. --- .../eng/tokenizerAutomaton-eng.tok | 2 +- .../data/test-eng.default.xml | 20 +++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/lima_linguisticdata/scratch/LinguisticProcessings/eng/tokenizerAutomaton-eng.tok b/lima_linguisticdata/scratch/LinguisticProcessings/eng/tokenizerAutomaton-eng.tok index 60118658a..412856c09 100644 --- a/lima_linguisticdata/scratch/LinguisticProcessings/eng/tokenizerAutomaton-eng.tok +++ b/lima_linguisticdata/scratch/LinguisticProcessings/eng/tokenizerAutomaton-eng.tok @@ -72,7 +72,7 @@ - c_M / SINGLE_UPPER (T_ALPHA,T_CAPITAL_1ST) - c_m / ALL_LOWER (T_ALPHA,T_SMALL) - c_5 / INTEGER (T_NUMERIC,T_INTEGER) - - c_hyphen|c_plus c_5 / INTEGER (T_NUMERIC) + - c_hyphen|c_plus c_5 / INTEGER (T_NUMERIC,T_INTEGER) - m_pattern m_pattern / PATTERN (T_PATTERN) - c_lowline / START - c_other / START diff --git a/lima_linguisticprocessing/data/test-eng.default.xml b/lima_linguisticprocessing/data/test-eng.default.xml index 4d15f9838..3594aeb78 100644 --- a/lima_linguisticprocessing/data/test-eng.default.xml +++ b/lima_linguisticprocessing/data/test-eng.default.xml @@ -195,4 +195,24 @@ operator="=" right="3m55"/> + + The negative numbers in digits -27 and -25 must be normalized into themselves. 
+ + + + + + + + + + From 4b1a0442d7978ac910249e5e708651b35e8dcf35 Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Tue, 12 Apr 2016 14:36:11 +0200 Subject: [PATCH 36/82] Better config for SRL From a96f79d216c0a8046c34cc3d7e98b88faf024871 Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Tue, 12 Apr 2016 15:27:25 +0200 Subject: [PATCH 37/82] Change the number of cols in SRL output Set this number as a define to ease up its later change. From 72f151de6415f26fae810b1ad8212cb49adb048b Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Wed, 13 Apr 2016 15:40:15 +0200 Subject: [PATCH 38/82] Correct interaction with external python srl --- .../core/AnalysisDumpers/ConllDumper.cpp | 75 +++++++++-- .../SemanticAnalysis/ConstraintFunction.cpp | 6 + .../SemanticRoleLabelingLoader.cpp | 54 +++++--- .../SemanticRoleLabelingLoader.h | 10 +- .../s/SemanticRoleLabelingLoader.cpp | 116 ------------------ .../s/SemanticRoleLabelingLoader.h | 97 --------------- 6 files changed, 113 insertions(+), 245 deletions(-) delete mode 100644 lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/s/SemanticRoleLabelingLoader.cpp delete mode 100644 lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/s/SemanticRoleLabelingLoader.h diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/ConllDumper.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/ConllDumper.cpp index aacae883f..53c4bad8c 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/ConllDumper.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/ConllDumper.cpp @@ -202,7 +202,9 @@ void ConllDumper::init(Common::XMLConfigurationFiles::GroupConfigurationStructur LimaStatusCode ConllDumper::process(AnalysisContent& analysis) const { DUMPERLOGINIT; +#ifdef DEBUG_LP LDEBUG << "ConllDumper::process"; +#endif LinguisticMetaData* 
metadata=static_cast(analysis.getData("LinguisticMetaData")); if (metadata == 0) { @@ -241,7 +243,9 @@ LimaStatusCode ConllDumper::process(AnalysisContent& analysis) const std::vector::iterator sbItr=(sd->getSegments().begin()); uint64_t nbSentences((sd->getSegments()).size()); +#ifdef DEBUG_LP LDEBUG << "ConllDumper::process There are "<< nbSentences << " sentences"; +#endif LinguisticGraphVertex sentenceBegin = sbItr->getFirstVertex(); LinguisticGraphVertex sentenceEnd = sbItr->getLastVertex(); @@ -268,7 +272,9 @@ LimaStatusCode ConllDumper::process(AnalysisContent& analysis) const std::mapsegmentationMapping;//mapping the two types of segmentations (Lima and conll) std::mapsegmentationMappingReverse; +#ifdef DEBUG_LP LDEBUG << "ConllDumper::process begin - end: " << sentenceBegin << " - " << sentenceEnd; +#endif //LinguisticGraphOutEdgeIt outItr,outItrEnd; QQueue toVisit; QSet visited; @@ -279,32 +285,46 @@ LimaStatusCode ConllDumper::process(AnalysisContent& analysis) const { v = toVisit.dequeue(); +#ifdef DEBUG_LP LDEBUG << "ConllDumper::process Vertex index : " << v; +#endif visited.insert(v); segmentationMapping.insert(std::make_pair(v,tokenId)); segmentationMappingReverse.insert(std::make_pair(tokenId,v)); +#ifdef DEBUG_LP LDEBUG << "ConllDumper::process conll id : " << tokenId << " Lima id : " << v; +#endif DependencyGraphVertex dcurrent = syntacticData->depVertexForTokenVertex(v); DependencyGraphOutEdgeIt dit, dit_end; boost::tie(dit,dit_end) = boost::out_edges(dcurrent,*depGraph); for (; dit != dit_end; dit++) { +#ifdef DEBUG_LP LDEBUG << "ConllDumper::process Dumping dependency edge " << (*dit).m_source << " -> " << (*dit).m_target; +#endif try { CEdgeDepRelTypePropertyMap typeMap = get(edge_deprel_type, *depGraph); SyntacticRelationId type = typeMap[*dit]; std::string syntRelName=static_cast(Common::MediaticData::MediaticData::single().mediaData(m_d->m_language)).getSyntacticRelationName(type); +#ifdef DEBUG_LP LDEBUG << "ConllDumper::process relation 
= " << syntRelName; LDEBUG << "ConllDumper::process Src : Dep vertex= " << boost::source(*dit, *depGraph); +#endif LinguisticGraphVertex src = syntacticData->tokenVertexForDepVertex(boost::source(*dit, *depGraph)); +#ifdef DEBUG_LP LDEBUG << "ConllDumper::process Src : Morph vertex= " << src; LDEBUG << "ConllDumper::process Targ : Dep vertex= " << boost::target(*dit, *depGraph); +#endif LinguisticGraphVertex dest = syntacticData->tokenVertexForDepVertex(boost::target(*dit, *depGraph)); +#ifdef DEBUG_LP LDEBUG << "ConllDumper::process Targ : Morph vertex= " << dest; +#endif if (syntRelName!="") { +#ifdef DEBUG_LP LDEBUG << "ConllDumper::process saving target for" << v << ":" << dest << syntRelName; +#endif vertexDependencyInformations.insert(std::make_pair(v, std::make_pair(dest,syntRelName))); } } @@ -313,8 +333,10 @@ LimaStatusCode ConllDumper::process(AnalysisContent& analysis) const } catch (...) { +#ifdef DEBUG_LP LDEBUG << "ConllDumper::process: catch others....."; - throw; +#endif + throw; } } if (v == sentenceEnd) @@ -345,7 +367,10 @@ LimaStatusCode ConllDumper::process(AnalysisContent& analysis) const // get the list of predicates for the current sentence QMultiMap predicates = m_d->collectPredicateTokens( analysis, sentenceBegin, sentenceEnd ); +#ifdef DEBUG_LP LDEBUG << "ConllDumper::process predicates for sentence between" << sentenceBegin << "and" << sentenceEnd << "are:" << predicates; +#endif + QList< LinguisticGraphVertex > keys = predicates.keys(); toVisit.enqueue(sentenceBegin); tokenId=0; @@ -356,12 +381,16 @@ LimaStatusCode ConllDumper::process(AnalysisContent& analysis) const Token* ft=get(vertex_token,*graph,v); MorphoSyntacticData* morphoData=get(vertex_data,*graph, v); +#ifdef DEBUG_LP LDEBUG << "ConllDumper::process PosGraph token" << v; +#endif if( morphoData!=0 && !morphoData->empty() && ft != 0) { const QString 
macro=QString::fromUtf8(static_cast(Common::MediaticData::MediaticData::single().mediaData(m_d->m_language)).getPropertyCodeManager().getPropertyManager("MACRO").getPropertySymbolicValue(morphoData->firstValue(*m_d->m_propertyAccessor)).c_str()); const QString micro=QString::fromUtf8(static_cast(Common::MediaticData::MediaticData::single().mediaData(m_d->m_language)).getPropertyCodeManager().getPropertyManager("MICRO").getPropertySymbolicValue(morphoData->firstValue(*m_d->m_propertyAccessor)).c_str()); +#ifdef DEBUG_LP LDEBUG << "ConllDumper::process graphTag:" << micro; +#endif std::string inflectedToken=ft->stringForm().toUtf8().constData(); std::string lemmatizedToken; @@ -397,12 +426,18 @@ LimaStatusCode ConllDumper::process(AnalysisContent& analysis) const if (vertexDependencyInformations.count(v)!=0) { LinguisticGraphVertex target=vertexDependencyInformations.find(v)->second.first; +#ifdef DEBUG_LP LDEBUG << "ConllDumper::process target saved for" << v << "is" << target; +#endif targetConllId=segmentationMapping.find(target)->second; +#ifdef DEBUG_LP LDEBUG << "ConllDumper::process conll target saved for " << tokenId << " is " << targetConllId; +#endif QString relName = QString::fromUtf8(vertexDependencyInformations.find(v)->second.second.c_str()); +#ifdef DEBUG_LP LDEBUG << "ConllDumper::process the lima dependency tag for " << ft->stringForm()<< " is " << relName; +#endif if (m_d->m_conllLimaDepMapping.contains(relName)) { conllRelName=m_d->m_conllLimaDepMapping[relName]; @@ -442,11 +477,11 @@ LimaStatusCode ConllDumper::process(AnalysisContent& analysis) const if (!predicates.isEmpty()) { dstream->out() << "\t"; - LDEBUG << "ConllDumper::process output the predicate if any"; +// LDEBUG << "ConllDumper::process output the predicate if any"; if (!predicates.contains(v)) { // No predicate for this token - dstream->out() << "-"; + dstream->out() << "_"; } else { @@ -454,26 +489,32 @@ LimaStatusCode ConllDumper::process(AnalysisContent& analysis) const QString 
predicateAnnotation = annotationData->stringAnnotation(predicates.value(v),"Predicate"); dstream->out() << predicateAnnotation; } - // Now output the roles supported by the current PoS graph token - LDEBUG << "ConllDumper::process output the roles for the" << predicates.keys().size() << "predicates"; - for (int i = 0; i < predicates.keys().size(); i++) + // Now output the roles supported by the current PoS graph token +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process output the roles for the" << keys.size() << "predicates"; +#endif + for (int i = 0; i < keys.size(); i++) { // There will be one column for each predicate. Output the // separator right now dstream->out() << "\t"; - AnnotationGraphVertex predicateVertex = predicates.value(predicates.keys()[i]); + AnnotationGraphVertex predicateVertex = predicates.value(keys[keys.size()-1-i]); std::set< AnnotationGraphVertex > vMatches = annotationData->matches("PosGraph", v, "annot"); if (vMatches.empty()) { - LDEBUG << "ConllDumper::process no node matching PoS graph vertex" << v << "in the annotation graph. Output '-'."; - dstream->out() << "-"; +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process no node matching PoS graph vertex" << v << "in the annotation graph. 
Output '_'."; +#endif + dstream->out() << "_"; } else { +#ifdef DEBUG_LP LDEBUG << "ConllDumper::process there is"<out() << "-"; +// dstream->out() << "_"; } } - if (roleAnnotation != "-") break; + if (roleAnnotation != "_") break; } dstream->out() << roleAnnotation.toUtf8().constData(); } @@ -508,15 +549,21 @@ LimaStatusCode ConllDumper::process(AnalysisContent& analysis) const { continue; } +#ifdef DEBUG_LP LDEBUG << "ConllDumper::process look at out edges of" << v; +#endif LinguisticGraphOutEdgeIt outIter,outIterEnd; for (boost::tie(outIter,outIterEnd) = boost::out_edges(v,*graph); outIter!=outIterEnd; outIter++) { LinguisticGraphVertex next = boost::target(*outIter,*graph); +#ifdef DEBUG_LP LDEBUG << "ConllDumper::process looking out vertex" << next; +#endif if (!visited.contains(next)) { +#ifdef DEBUG_LP LDEBUG << "ConllDumper::process enqueuing" << next; +#endif visited.insert(next); toVisit.enqueue(next); } @@ -555,7 +602,9 @@ QMultiMap ConllDumperPrivate::coll while (v!=sentenceEnd && !toVisit.empty()) { v = toVisit.dequeue(); +#ifdef DEBUG_LP LDEBUG << "ConllDumperPrivate::collectPredicateTokens vertex:" << v; +#endif visited.insert(v); std::set< AnnotationGraphVertex > vMatches = annotationData->matches("PosGraph", v, "annot"); @@ -564,7 +613,9 @@ QMultiMap ConllDumperPrivate::coll AnnotationGraphVertex vMatch = *it; if (annotationData->hasStringAnnotation(vMatch,"Predicate")) { +#ifdef DEBUG_LP LDEBUG << "ConllDumperPrivate::collectPredicateTokens insert" << v << vMatch; +#endif result.insert(v, vMatch); } } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/ConstraintFunction.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/ConstraintFunction.cpp index 159c8c4e6..c969eaaf4 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/ConstraintFunction.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/ConstraintFunction.cpp @@ -70,7 
+70,9 @@ ConstraintFunction(language,complement) bool ClearSemanticRelation::operator()(AnalysisContent& analysis ) const { SEMLOGINIT; +#ifdef DEBUG_LP LDEBUG << "ClearSemanticRelation::operator()"; +#endif SemanticRelationData * semanticData=static_cast(analysis.getData("SemanticRelationData")); if (semanticData==0) { @@ -91,7 +93,9 @@ SaveSemanticRelation::SaveSemanticRelation(MediaId language, bool SaveSemanticRelation::operator()(AnalysisContent& analysis ) const { SEMLOGINIT; +#ifdef DEBUG_LP LDEBUG << "SaveSemanticRelation::operator()"; +#endif SemanticRelationData * semanticData=static_cast(analysis.getData("SemanticRelationData")); if (semanticData==0) { @@ -117,7 +121,9 @@ operator()(const LinguisticAnalysisStructure::AnalysisGraph& anagraph, AnalysisContent& analysis ) const { SEMLOGINIT; +#ifdef DEBUG_LP LDEBUG << "CreateSemanticRelation::operator()" << vertex1 << vertex2 << m_semanticRelationType; +#endif LIMA_UNUSED(anagraph); SemanticRelationData * semanticData=static_cast(analysis.getData("SemanticRelationData")); if (semanticData==0) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRoleLabelingLoader.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRoleLabelingLoader.cpp index 4a933dfa3..ba6fb7279 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRoleLabelingLoader.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRoleLabelingLoader.cpp @@ -19,9 +19,10 @@ /************************************************************************ * * @file SemanticRoleLabelingLoader.cpp - * @author Clémence Filmont + * @author Clémence Filmont + * @author Gael de Chalendar * @date 2014 - * copyright Copyright (C) 2014 by CEA LIST + * copyright Copyright (C) 2014-2016 by CEA LIST ***********************************************************************/ #include "SemanticRoleLabelingLoader.h" @@ -43,6 
+44,7 @@ #include #include +#include #include #include @@ -64,8 +66,9 @@ namespace SemanticAnalysis { SimpleFactory SemanticRoleLabelingFactory(SEMANTICROLELABELINGLOADER_CLASSID); -#define NBCOLSINSRLBEFOREFRAME 10 - +#define NBCOLSINSRLBEFOREFRAME 11 +#define CONLLTOKENSEPARATOR "\n+" +#define CONLLFIELDSEPARATOR "\t" // Conll handler struct ConllHandler @@ -213,8 +216,11 @@ LimaStatusCode SemanticRoleLabelingLoader::process(AnalysisContent& analysis) co { int sentenceIndex=it->first; QString sentence=it->second; - if(cHandler.extractSemanticInformation(sentenceIndex, limaConllMapping,sentence)){ + if(cHandler.extractSemanticInformation(sentenceIndex, limaConllMapping, sentence)) + { +#ifdef DEBUG_LP LDEBUG << "SemanticRoleLabelingLoader::process there is/are " << cHandler.m_verbalClassNb << "verbal class(es) for this sentence " ; +#endif for (int vClassIndex=0;vClassIndexannotate(annotPredicateVertex, "Predicate", verbalClass); +#ifdef DEBUG_LP LDEBUG << "SemanticRoleLabelingLoader::process: annotation vertex"<< annotPredicateVertex <<"was created for the verbal class "<< annotationData->stringAnnotation(annotPredicateVertex, "Predicate") << "and the PoS graph vertex"<>::iterator semRoleIt; for (semRoleIt=cHandler.m_semanticRoles[vClassIndex].begin(); semRoleIt!=cHandler.m_semanticRoles[vClassIndex].end();semRoleIt++){ LinguisticGraphVertex posGraphRoleVertex=(*semRoleIt).first; @@ -236,7 +244,9 @@ LimaStatusCode SemanticRoleLabelingLoader::process(AnalysisContent& analysis) co annotationData->addMatching("PosGraph", posGraphRoleVertex, "annot", annotRoleVertex); +#ifdef DEBUG_LP LDEBUG << "SemanticRoleLabelingLoader::process: annotation edge" << roleEdge << "annotated " << annotationData->stringAnnotation(roleEdge, "SemanticRole")<< "was created for" << verbalClass << " and the PoS graph vertices " << posGraphPredicateVertex << "and" << posGraphRoleVertex ; +#endif } } } @@ -250,8 +260,8 @@ ConllHandler::ConllHandler(MediaId language, AnalysisContent& analysis, 
Linguist m_language(language), m_analysis(analysis), m_graph(graph), -m_descriptorSeparator("\t+"), -m_tokenSeparator("\n+"), +m_descriptorSeparator(CONLLFIELDSEPARATOR), +m_tokenSeparator(CONLLTOKENSEPARATOR), m_verbalClasses(), m_semanticRoles(), m_verbalClassNb() @@ -269,29 +279,36 @@ bool ConllHandler::extractSemanticInformation(int sentenceI, LimaConllTokenIdMap QStringList sentenceTokens=cHandler.splitSegment(sent, m_tokenSeparator); QString firstSentenceToken=(*sentenceTokens.constBegin()); int descriptorsNb = cHandler.splitSegment(firstSentenceToken, m_descriptorSeparator).size(); - m_verbalClassNb = descriptorsNb -NBCOLSINSRLBEFOREFRAME; + m_verbalClassNb = descriptorsNb - NBCOLSINSRLBEFOREFRAME - 1; int classIndex=0; if (m_verbalClassNb > 0) { +#ifdef DEBUG_LP LDEBUG << "ConllHandler::extractSemanticInformation" << m_verbalClassNb << sentenceI << " : \n" << sent ; +#endif m_verbalClasses.clear(); m_verbalClasses.resize(m_verbalClassNb); m_semanticRoles.clear(); m_semanticRoles.resize(m_verbalClassNb); //repeated on each token of the sentence, that is on each line - for (QStringList::const_iterator tokensIterator = sentenceTokens.constBegin(); tokensIterator != sentenceTokens.constEnd(); - ++tokensIterator) + for (const auto & token: sentenceTokens) { int roleNumbers=0; - QStringList descriptors=cHandler.splitSegment((*tokensIterator),m_descriptorSeparator); + QStringList descriptors=cHandler.splitSegment(token,m_descriptorSeparator); if (descriptors.size()>=NBCOLSINSRLBEFOREFRAME+m_verbalClassNb) { int conllTokenId=descriptors[0].toInt(); QString conllToken=descriptors[1]; +#ifdef DEBUG_LP + LDEBUG << "ConllHandler::extractSemanticInformation token " << conllTokenId << conllToken; +#endif if(descriptors[NBCOLSINSRLBEFOREFRAME]!="_") { QString verbalClass=descriptors[NBCOLSINSRLBEFOREFRAME]; QString vClass=descriptors[NBCOLSINSRLBEFOREFRAME]; +#ifdef DEBUG_LP + LDEBUG << "ConllHandler::extractSemanticInformation verbalClass" << vClass; +#endif 
LinguisticGraphVertex limaTokenId=cHandler.getLimaTokenId(conllTokenId, sentenceI, limaConllMapping); if (classIndex >= m_verbalClasses.size()) { @@ -301,22 +318,27 @@ bool ConllHandler::extractSemanticInformation(int sentenceI, LimaConllTokenIdMap m_verbalClasses[classIndex]=qMakePair(limaTokenId, vClass); classIndex++; } - for (int roleTargetFieldIndex=0; roleTargetFieldIndex= descriptors.size()) { LERROR << "ConllHandler::extractSemanticInformation roleTargetFieldIndex error" << roleTargetFieldIndex; break; } - if (descriptors[NBCOLSINSRLBEFOREFRAME+roleTargetFieldIndex]!="_") + if (descriptors[NBCOLSINSRLBEFOREFRAME+1+roleTargetFieldIndex]!="_") { - QString semanticRoleLabel=descriptors[NBCOLSINSRLBEFOREFRAME+roleTargetFieldIndex]; + QString semanticRoleLabel=descriptors[NBCOLSINSRLBEFOREFRAME+1+roleTargetFieldIndex]; LinguisticGraphVertex limaTokenId=cHandler.getLimaTokenId(conllTokenId, sentenceI, limaConllMapping); if(limaTokenId!=0) { - LDEBUG << "ConllHandler::extractSemanticInformation The PoS graph token id matching the conll token id " << conllTokenId << " is " << limaTokenId; +#ifdef DEBUG_LP + LDEBUG << "ConllHandler::extractSemanticInformation argument "<> sRoles; if (roleTargetFieldIndex >= m_semanticRoles.size()) { diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRoleLabelingLoader.h b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRoleLabelingLoader.h index 7ca381c95..d78791565 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRoleLabelingLoader.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRoleLabelingLoader.h @@ -18,9 +18,10 @@ */ /** * @file SemanticRoleLabelingLoader.h - * @author Clémence Filmont + * @author Clémence Filmont + * @author Gael de Chalendar * @date 2014-04-17 - * copyright Copyright (C) 2014 by CEA LIST + * copyright Copyright (C) 2014-2016 by CEA LIST */ #ifndef 
SEMANTICROLELABELINGLOADER_H @@ -38,7 +39,8 @@ class SemanticRoleLabelingLoaderPrivate; /** * @brief A Semantic Role Labeling Loader class - * @author Clémence Filmont + * @author Clémence Filmont + * @author Gael de Chalendar */ class SemanticRoleLabelingLoader : public AnalysisLoader { @@ -52,7 +54,7 @@ class SemanticRoleLabelingLoader : public AnalysisLoader LimaStatusCode process(AnalysisContent& analysis) const; private: - SemanticRoleLabelingLoaderPrivate* m_d;; + SemanticRoleLabelingLoaderPrivate* m_d; }; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/s/SemanticRoleLabelingLoader.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/s/SemanticRoleLabelingLoader.cpp deleted file mode 100644 index 28ca49dff..000000000 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/s/SemanticRoleLabelingLoader.cpp +++ /dev/null @@ -1,116 +0,0 @@ -/* -Copyright 2002-2014 CEA LIST - -This file is part of LIMA. - -LIMA is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -LIMA is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. - -You should have received a copy of the GNU Affero General Public License -along with LIMA. 
If not, see -*/ -/************************************************************************ -* -* @file SemanticRoleLabelingLoader.cpp -* @author Clémence Filmont -* @date 2014-- -* copyright Copyright (C) 2014 by CEA LIST -* Project mm_linguisticprocessing -* -* -***********************************************************************/ - -#include "SemanticRoleLabelingLoader.h" -#include "common/AbstractFactoryPattern/SimpleFactory.h" -#include "common/Data/strwstrtools.h" -#include "linguisticProcessing/core/Automaton/recognizerMatch.h" -#include "linguisticProcessing/core/Automaton/recognizerData.h" -#include "common/MediaticData/mediaticData.h" -#include "linguisticProcessing/common/annotationGraph/AnnotationGraph.h" -#include -#include "QStringList" - -using namespace std; -using namespace Lima::LinguisticProcessing::LinguisticAnalysisStructure; -using namespace Lima::LinguisticProcessing::ApplyRecognizer; -using namespace Lima::Common::XMLConfigurationFiles; - - - -namespace Lima { -namespace LinguisticProcessing { -namespace SemanticAnalysis { - -SimpleFactory SemanticRoleLabelingFactory(SEMANTICROLELABELINGLOADER_CLASSID); - - -//*********************************************************************** -SemanticRoleLabelingLoader::SemanticRoleLabelingLoader(): -m_language(0), -m_graph("AnalysisGraph"), -m_suffix(".conll") -{} - -SemanticRoleLabelingLoader::~SemanticRoleLabelingLoader() -{ -} - -//*********************************************************************** - -void SemanticRoleLabelingLoader::init(Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, - Manager* manager){ - - PROCESSORSLOGINIT; - m_language=manager->getInitializationParameters().media; - AnalysisLoader::init(unitConfiguration,manager); - try - { - m_suffix=unitConfiguration.getParamsValueAtKey("outputSuffix"); - } - catch (NoSuchParam& ) {} // keep default value - AnalysisLoader::init(unitConfiguration,manager); - } - - - LimaStatusCode 
SemanticRoleLabelingLoader::process(AnalysisContent& analysis) const{ - QFile file("/home/clemence/textes_test/jamaica_out.conll"); - } - - -SemanticRoleLabelingLoader::ConllHandler::ConllHandler(MediaId language, AnalysisContent& analysis, LinguisticAnalysisStructure::AnalysisGraph* graph): -m_tagIndex(), -m_language(language), -m_analysis(analysis), -m_graph(graph), -m_position(0), -m_length(0), -m_type(), -m_string(), -m_currentElement() -{ - PROCESSORSLOGINIT; - LDEBUG << "SemanticRoleLabelingLoader::ConllHandler constructor"; -} -SemanticRoleLabelingLoader::ConllHandler::~ConllHandler(){} - -// repeated on each line beginning -bool extractSemanticRole(const QString & tokenDescription) -{ - QStringList descriptors; - descriptors=tokenDescription.split(QRegExp("\\t+")); -// cout << descriptors[11]< -*/ -/************************************************************************ -* -* @file SemanticRoleLabelingLoader.h -* @author Clémence Filmont -* @date 2014-04-17 -* copyright Copyright (C) 2014 by CEA LIST -* Project mm_linguisticprocessing -* -* @brief an Semantic Role Labeling Loader class -* -* -***********************************************************************/ - -#ifndef SEMANTICROLELABELINGLOADER_H -#define SEMANTICROLELABELINGLOADER_H - -#include "linguisticProcessing/core/LinguisticProcessors/AnalysisLoader.h" -#include "linguisticProcessing/core/LinguisticAnalysisStructure/AnalysisGraph.h" -#include "linguisticProcessing/core/LinguisticAnalysisStructure/LinguisticGraph.h" -#include -#include -#include - -namespace Lima { -namespace LinguisticProcessing { -namespace SemanticAnalysis { - -#define SEMANTICROLELABELINGLOADER_CLASSID "SemanticRoleLabelingLoader" - -class SemanticRoleLabelingLoader : public AnalysisLoader -{ - public: - SemanticRoleLabelingLoader(); - virtual ~SemanticRoleLabelingLoader(); - - void init(Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, - Manager* manager); - - LimaStatusCode 
process(AnalysisContent& analysis) const; - - private: - MediaId m_language; - std::string m_graph; - std::string m_suffix; -// QXmlSimpleReader* m_parser; /*< XML parser for the loader*/ - - // XML handler - class ConllHandler - { - public: - QMap m_tagIndex; - - ConllHandler(MediaId language, AnalysisContent& analysis, LinguisticAnalysisStructure::AnalysisGraph* graph); - virtual ~ConllHandler(); - - - bool extractSemanticRole(const QString & expectedRole);// repeated on each line beginning - - - - private: - MediaId m_language; - AnalysisContent& m_analysis; - LinguisticAnalysisStructure::AnalysisGraph* m_graph; - uint64_t m_position; - uint64_t m_length; - std::string m_type; - std::string m_string; - std::string m_currentElement; - - }; - -}; - -} // end namespace -} // end namespace -} // end namespace - -#endif \ No newline at end of file From 19e0e367ac2fe8b2ef79180523e192f77f74935f Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Mon, 25 Apr 2016 23:51:45 +0200 Subject: [PATCH 39/82] Add a process unit with python srl embedded This is to replace the completely external process unit which takes a lot of time to initialize each time it is called. 
--- lima_linguisticprocessing/CMakeLists.txt | 10 + .../conf/lima-lp-eng.xml | 9 + .../core/SemanticAnalysis/CMakeLists.txt | 32 +- .../KnowledgeBasedSemanticRoleLabeler.cpp | 308 ++++++++++++++++++ .../KnowledgeBasedSemanticRoleLabeler.h | 74 +++++ lima_linguisticprocessing/test/CMakeLists.txt | 14 + lima_linguisticprocessing/test/srl.cpp | 122 +++++++ 7 files changed, 567 insertions(+), 2 deletions(-) create mode 100644 lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp create mode 100644 lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.h create mode 100644 lima_linguisticprocessing/test/srl.cpp diff --git a/lima_linguisticprocessing/CMakeLists.txt b/lima_linguisticprocessing/CMakeLists.txt index 4a7487972..b61fb2c20 100644 --- a/lima_linguisticprocessing/CMakeLists.txt +++ b/lima_linguisticprocessing/CMakeLists.txt @@ -118,6 +118,16 @@ if ("${QT_DBUSCPP2XML_EXECUTABLE}" STREQUAL "QT_DBUSCPP2XML_EXECUTABLE-NOTFOUND" message(WARNING "Dbus tools not found ${QT_DBUSCPP2XML_EXECUTABLE}") endif() +if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") + install(FILES ${QtNetwork_location} + ${QtXmlPatterns_location} + DESTINATION ${LIB_INSTALL_DIR}) +endif () + +find_package(PythonLibs 3.4 REQUIRED) +include_directories(${PYTHON_INCLUDE_DIRS}) +link_directories(${PYTHON_LIBRARIES}) + #add_definitions( -DBOOST_ALL_NO_LIB ) add_definitions( -DBOOST_ALL_DYN_LINK ) set( Boost_USE_STATIC_LIBS OFF ) diff --git a/lima_linguisticprocessing/conf/lima-lp-eng.xml b/lima_linguisticprocessing/conf/lima-lp-eng.xml index 6c46f1503..34a64fa0e 100644 --- a/lima_linguisticprocessing/conf/lima-lp-eng.xml +++ b/lima_linguisticprocessing/conf/lima-lp-eng.xml @@ -711,6 +711,15 @@ + + + + + + + + + diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/CMakeLists.txt 
b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/CMakeLists.txt index af63988ed..3c169c243 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/CMakeLists.txt +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/CMakeLists.txt @@ -21,12 +21,40 @@ add_definitions(-DLIMA_SEMANTICANALYSIS_EXPORTING) ########### next target ############### SET(lima-lp-semanticanalysis_LIB_SRCS - ConstraintFunction.cpp SemanticRelationAnnotation.cpp SemanticRelationData.cpp SemanticRoleLabelingLoader.cpp LimaConllTokenIdMapping.cpp + ConstraintFunction.cpp + KnowledgeBasedSemanticRoleLabeler.cpp + SemanticRelationAnnotation.cpp + SemanticRelationData.cpp + SemanticRoleLabelingLoader.cpp + LimaConllTokenIdMapping.cpp + SemanticRelationsXmlLogger.cpp ) add_library(lima-lp-semanticanalysis SHARED ${lima-lp-semanticanalysis_LIB_SRCS}) -target_link_libraries(lima-lp-semanticanalysis lima-common-factory lima-common-misc lima-common-data lima-common-fsaaccess lima-common-mediaticdata lima-common-time lima-common-factory lima-common-xmlconfigurationfiles lima-common-processunitframework lima-common-mediaprocessors lima-lp-linguisticprocessors lima-lp-linguisticresources lima-lp-annotationgraph lima-lp-linguisticanalysisstructure lima-lp-textsegmentation lima-lp-syntacticanalysis lima-lp-automaton lima-lp-applyrecognizer +target_link_libraries(lima-lp-semanticanalysis + lima-common-factory + lima-common-misc + lima-common-data + lima-common-fsaaccess + lima-common-mediaticdata + lima-common-time + lima-common-factory + lima-common-xmlconfigurationfiles + lima-common-processunitframework + lima-common-mediaprocessors + lima-lp-linguisticprocessors + lima-lp-linguisticresources + lima-lp-annotationgraph + lima-lp-linguisticanalysisstructure + lima-lp-textsegmentation + lima-lp-syntacticanalysis + lima-lp-automaton + lima-lp-applyrecognizer + ${optionalLibs} + ${Boost_LIBRARIES} + ${QT_LIBRARIES} + 
${PYTHON_LIBRARY} ) set_target_properties(lima-lp-semanticanalysis PROPERTIES VERSION ${LIMA_LP_LIB_VERSION} SOVERSION ${LIMA_LP_LIB_SOVERSION}) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp new file mode 100644 index 000000000..e5de85f07 --- /dev/null +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp @@ -0,0 +1,308 @@ +/* + Copyright 2016 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. 
If not, see +*/ + +#include "KnowledgeBasedSemanticRoleLabeler.h" + +#include "common/Data/LimaString.h" +#include "common/misc/Exceptions.h" +#include "common/Data/strwstrtools.h" +#include "common/AbstractFactoryPattern/SimpleFactory.h" +#include "linguisticProcessing/core/AnalysisDumpers/ConllDumper.h" +#include "linguisticProcessing/core/LinguisticResources/LinguisticResources.h" +#include "linguisticProcessing/core/LinguisticProcessors/LimaStringText.h" +#include "linguisticProcessing/core/LinguisticProcessors/LinguisticMetaData.h" +#include "linguisticProcessing/core/LinguisticAnalysisStructure/AnalysisGraph.h" +#include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" +#include "common/tools/FileUtils.h" +#include "common/MediaticData/mediaticData.h" +#include "common/time/timeUtilsController.h" + +#include +#include +#include + +using namespace std; +using namespace Lima::LinguisticProcessing::AnalysisDumpers; +using namespace Lima::LinguisticProcessing::LinguisticAnalysisStructure; +using namespace Lima::Common::XMLConfigurationFiles; +using namespace Lima::Common::Misc; + +namespace Lima +{ +namespace LinguisticProcessing +{ +namespace SemanticAnalysis +{ + +static SimpleFactory knowledgeBasedSemanticRoleLabelerFactory(KNOWLEDGEBASEDSEMANTICROLELABELER_CLASSID); + + +class KnowledgeBasedSemanticRoleLabelerPrivate +{ +public: + KnowledgeBasedSemanticRoleLabelerPrivate(); + virtual ~KnowledgeBasedSemanticRoleLabelerPrivate(); + + PyObject* m_instance; + const MediaProcessUnit* m_dumper; + const MediaProcessUnit* m_loader; + QString m_inputSuffix; + QString m_outputSuffix; +}; + +KnowledgeBasedSemanticRoleLabelerPrivate::KnowledgeBasedSemanticRoleLabelerPrivate() : + m_instance(0), + m_dumper(new ConllDumper()) +{} + +KnowledgeBasedSemanticRoleLabelerPrivate::~KnowledgeBasedSemanticRoleLabelerPrivate() +{ +} + +KnowledgeBasedSemanticRoleLabeler::KnowledgeBasedSemanticRoleLabeler() : m_d(new KnowledgeBasedSemanticRoleLabelerPrivate()) +{} + + 
+KnowledgeBasedSemanticRoleLabeler::~KnowledgeBasedSemanticRoleLabeler() +{ + delete m_d; +} + +void KnowledgeBasedSemanticRoleLabeler::init( + Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, + Manager* manager) + +{ +#ifdef DEBUG_LP + SEMANTICANALYSISLOGINIT; + LDEBUG << "KnowledgeBasedSemanticRoleLabeler::init"; +#endif + + MediaId language=manager->getInitializationParameters().media; + try { + string dumperName=unitConfiguration.getParamsValueAtKey("dumper"); + // create the dumper + m_d->m_dumper=manager->getObject(dumperName); + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + LERROR << "Missing 'dumper' parameter in KnowledgeBasedSemanticRoleLabeler group for language " + << (int)language << " !"; + throw InvalidConfiguration(); + } + + try { + string loaderName=unitConfiguration.getParamsValueAtKey("loader"); + // create the loader + m_d->m_loader=manager->getObject(loaderName); + } + catch (InvalidConfiguration& ) { + m_d->m_loader = 0; + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + LERROR << "Missing 'loader' parameter in KnowledgeBasedSemanticRoleLabeler group for language " + << (int)language << " !"; + throw InvalidConfiguration(); + } + + try { + m_d->m_inputSuffix=QString::fromUtf8(unitConfiguration.getParamsValueAtKey("inputSuffix").c_str()); + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + // optional parameter: keep default value + } + + try { + m_d->m_outputSuffix=QString::fromUtf8(unitConfiguration.getParamsValueAtKey("outputSuffix").c_str()); + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + // optional parameter: keep default value + } + + QString path; + QString mode = "VerbNet"; + QString kbsrlLogLevel = "error"; + + try + { + kbsrlLogLevel = QString::fromUtf8(unitConfiguration.getParamsValueAtKey("loglevel").c_str()); + } + catch (NoSuchParam& ) + { + // keep default + } + + try + { + path = 
QString::fromUtf8(unitConfiguration.getParamsValueAtKey("path").c_str()); + } + catch (NoSuchParam& ) + { + SEMANTICANALYSISLOGINIT; + LERROR << "no param 'path' in KnowledgeBasedSemanticRoleLabeler group configuration"; + throw InvalidConfiguration(); + } + + try + { + mode = QString::fromUtf8(unitConfiguration.getParamsValueAtKey("mode").c_str()); + if (mode != "VerbNet" && mode != "FrameNet") + { + SEMANTICANALYSISLOGINIT; + LERROR << "Unknown semantic annotation mode" << mode; + throw InvalidConfiguration(); + } + } + catch (NoSuchParam& ) + { + // keep default + } + + // Initialize the python SRL system + /* + * Find the first python executable in the path and use it as the program name. + * + * This allows to find the modules set up in an activated virtualenv + */ + QString str_program_name; + QString pathEnv = QString::fromUtf8(qgetenv("PATH").constData()); + for (const auto & path: pathEnv.split(QRegExp("[;:]"))) + { + if (QFile::exists(path + "/python" )) + { + str_program_name = path + "/python"; + break; + } + } + Py_SetProgramName(const_cast( str_program_name.toStdWString().c_str())); + + + Py_Initialize(); + + PyObject* main_module = PyImport_ImportModule("__main__"); + PyObject* main_dict = PyModule_GetDict(main_module); + PyObject* sys_module = PyImport_ImportModule("sys"); + if (sys_module == NULL) + { + LERROR << "Failed to import the sys module"; + PyErr_Print(); + } + PyObject* sys_dict = PyModule_GetDict(sys_module); + PyDict_SetItemString(main_dict, "sys", sys_module); + + // Add the path to the knowledgesrl pachkage to putho path + PyObject* pythonpath = PySys_GetObject("path"); + if (PyList_Append(pythonpath, PyUnicode_DecodeFSDefault("/home/gael/Projets/knowledgesrl/src")) == -1) + { + LERROR << "Failed to append to python path"; + PyErr_Print(); + Py_Exit(1); + } + + // Import the semanticrolelabeller module + PyObject* semanticrolelabeller_module = PyImport_ImportModule("semanticrolelabeller"); + if (semanticrolelabeller_module == NULL) + 
{ + LERROR << "Failed to import srl semanticrolelabeller module"; + PyErr_Print(); + Py_Exit(1); + } + + // Create the semantic role labeller instance + m_d->m_instance = PyObject_CallMethod(semanticrolelabeller_module, "SemanticRoleLabeller", "[s]", QString("--log=%1").arg(kbsrlLogLevel).toUtf8().constData()); + if (m_d->m_instance == NULL) + { + LERROR << "Cannot instantiate the SemanticRoleLabeller python class"; + PyErr_Print(); + Py_Exit(1); + } +} + +LimaStatusCode KnowledgeBasedSemanticRoleLabeler::process( + AnalysisContent& analysis) const +{ + TimeUtilsController knowledgeBasedSemanticRoleLabelerProcessTime("KnowledgeBasedSemanticRoleLabeler"); + SEMANTICANALYSISLOGINIT; + LINFO << "start SRL process"; + + LinguisticMetaData* metadata=static_cast(analysis.getData("LinguisticMetaData")); + if (metadata == 0) { + LERROR << "no LinguisticMetaData ! abort"; + return MISSING_DATA; + } + + // Use CoNLL duper to produce the input to the SRL + LimaStatusCode returnCode(SUCCESS_ID); + returnCode=m_d->m_dumper->process(analysis); + if (returnCode!=SUCCESS_ID) { + LERROR << "KnowledgeBasedSemanticRoleLabeler: failed to dump data to temporary file"; + return returnCode; + } + + QString fileName = QString::fromUtf8(metadata->getMetaData("FileName").c_str()); + QString inputFilename, outputFilename; + if (!m_d->m_inputSuffix.isEmpty()) + { + inputFilename = fileName+ m_d->m_inputSuffix; + } + QFile inputFile(inputFilename); + inputFile.open(QIODevice::ReadOnly); + QString conllInput = QString::fromUtf8(inputFile.readAll().constData()); + inputFile.close(); + if (!m_d->m_outputSuffix.isEmpty()) + { + outputFilename = fileName + m_d->m_outputSuffix; + } + + // Run the semantic role labeller + PyObject* callResult = PyObject_CallMethod(m_d->m_instance, "annotate", "s", conllInput.toUtf8().constData()); + if (callResult == NULL) + { + LERROR << "Failed to call the annotate method"; + PyErr_Print(); + Py_Exit(1); + } + + // Display the SRL result + char* result = 
PyUnicode_AsUTF8(callResult); + if (result == NULL) + { + LERROR << "Cannot convert result item to string"; + PyErr_Print(); + Py_Exit(1); + } + LDEBUG << "Python result is:" << result; + QFile outputFile(outputFilename); + outputFile.open(QIODevice::WriteOnly); + outputFile.write(result); + outputFile.close(); + + // Import the CoNLL result + returnCode=m_d->m_loader->process(analysis); + if (returnCode!=SUCCESS_ID) { + LERROR << "KnowledgeBasedSemanticRoleLabeler: failed to load data from temporary file"; + return returnCode; + } + + + return returnCode; +} + +} //namespace SemanticAnalysis +} // namespace LinguisticProcessing +} // namespace Lima diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.h b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.h new file mode 100644 index 000000000..f8927c377 --- /dev/null +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.h @@ -0,0 +1,74 @@ +/* + Copyright 2016 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. 
If not, see +*/ + +#ifndef LIMA_LINGUISTICPROCESSING_SEMANTICANALYSIS_KNOWLEDGEBASEDSEMANTICROLELABELER_H +#define LIMA_LINGUISTICPROCESSING_SEMANTICANALYSIS_KNOWLEDGEBASEDSEMANTICROLELABELER_H + +#include "SemanticAnalysisExport.h" +#include "common/MediaProcessors/MediaProcessUnit.h" + +namespace Lima +{ +namespace LinguisticProcessing +{ +namespace SemanticAnalysis +{ + +#define KNOWLEDGEBASEDSEMANTICROLELABELER_CLASSID "KnowledgeBasedSemanticRoleLabeler" + +class KnowledgeBasedSemanticRoleLabelerPrivate; + +/** @brief This is a @ref MediaProcessUnit which do semantic role labeling using the knowledge-based + * SRL in python made by Quentin Pradet during his PhD thesis + * + * As a ProcessUnit, it has an init and a process function. See @ref ProcessUnit for details. + * + * IOPES: + * - Input: an AnalysisContent and the following parameters in the configuration file: + * - debug: whether the debug option of the python module should be activated or not + * - path: the path to the knowledgesrl python package + * - mode: the semantic model to use to annotate. Either VerbNet (default) or FrameNet. + * - Output: an AnalysisContent + * - Preconditions: the AnalysisContent must the result of the syntactic analysis + * - Effects: the annotation graph will be updated with SRL annotations. 
+ */ +class LIMA_SEMANTICANALYSIS_EXPORT KnowledgeBasedSemanticRoleLabeler : public MediaProcessUnit +{ + +public: + KnowledgeBasedSemanticRoleLabeler(); + virtual ~KnowledgeBasedSemanticRoleLabeler(); + + void init( + Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, + Manager* manager) + ; + + LimaStatusCode process(AnalysisContent& analysis) const; + + private: + + KnowledgeBasedSemanticRoleLabelerPrivate* m_d; +}; + +} // namespace SemanticAnalysis +} // namespace LinguisticProcessing +} // namespace Lima + +#endif diff --git a/lima_linguisticprocessing/test/CMakeLists.txt b/lima_linguisticprocessing/test/CMakeLists.txt index b67fe0456..7f1ab9fbd 100644 --- a/lima_linguisticprocessing/test/CMakeLists.txt +++ b/lima_linguisticprocessing/test/CMakeLists.txt @@ -64,6 +64,20 @@ install(TARGETS analyzeText DESTINATION bin) ########### next target ############### +SET(srl_SRCS + srl.cpp +) + +add_executable(srl ${srl_SRCS}) +target_link_libraries(srl + ${PYTHON_LIBRARY} + ${QT_LIBRARIES} +) + +install(TARGETS srl DESTINATION bin) + +########### next target ############### + # SET(threadedAnalyzeText_SRCS # threadedAnalyzeText.cpp # ) diff --git a/lima_linguisticprocessing/test/srl.cpp b/lima_linguisticprocessing/test/srl.cpp new file mode 100644 index 000000000..4275f531a --- /dev/null +++ b/lima_linguisticprocessing/test/srl.cpp @@ -0,0 +1,122 @@ +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. 
+ + You should have received a copy of the GNU Affero General Public License + along with LIMA. If not, see +*/ + +#include +#include +#include + +#if PY_MAJOR_VERSION < 3 +#error "Python version must be 3 or more" +#endif + +int main(int argc, char **argv) +{ +std::string text = "1 The the DET DT _ _ 2 NMOD _ _\n" +"2 Convention Convention NP NNP _ _ 4 SUB _ _\n" +"3 also also ADV RB _ _ 4 VMOD _ _\n" +"4 established establish V VBD _ _ _ _ _ _\n" +"5 eleven eleven NOMBRE CD Numex.NUMBER _ _ _ _ _\n" +"6 Working working ADJ JJ _ _ 7 NMOD _ _\n" +"7 Groups group NC NNS _ _ 4 OBJ _ _\n" +"8 and and CONJ CC _ _ _ _ _ _\n" +"9 three three NOMBRE CD Numex.NUMBER _ _ _ _ _\n" +"10 Discussion discussion NC NN _ _ 11 NMOD _ _\n" +"11 Circles circle NC NNS _ _ _ DEP _ _"; + +/* + * Find the first python executable in the path and use it as the program name. + * + * This allows to find the modules set up in an activated virtualenv + */ + QString str_program_name; + QString pathEnv = QString::fromUtf8(qgetenv("PATH").constData()); + for (const auto & path: pathEnv.split(QRegExp("[;:]"))) + { + if (QFile::exists(path + "/python" )) + { + str_program_name = path + "/python"; + break; + } + } + Py_SetProgramName(const_cast( str_program_name.toStdWString().c_str())); + + + Py_Initialize(); + + PyObject* main_module = PyImport_ImportModule("__main__"); + PyObject* main_dict = PyModule_GetDict(main_module); + PyObject* sys_module = PyImport_ImportModule("sys"); + if (sys_module == NULL) + { + std::cerr << "Failed to import the sys module" << std::endl; + PyErr_Print(); + } + PyObject* sys_dict = PyModule_GetDict(sys_module); + PyDict_SetItemString(main_dict, "sys", sys_module); + + // Add the path to the knowledgesrl pachkage to putho path + PyObject* pythonpath = PySys_GetObject("path"); + if (PyList_Append(pythonpath, PyUnicode_DecodeFSDefault("/home/gael/Projets/knowledgesrl/src")) == -1) + { + std::cerr << "Failed to append to python path" << std::endl; + PyErr_Print(); + 
Py_Exit(1); + } + + // Import the semanticrolelabeller module + PyObject* semanticrolelabeller_module = PyImport_ImportModule("semanticrolelabeller"); + if (semanticrolelabeller_module == NULL) + { + std::cerr << "Failed to import srl semanticrolelabeller module" << std::endl; + PyErr_Print(); + Py_Exit(1); + } + + // Create the semantic role labeller instance + PyObject* instance = PyObject_CallMethod(semanticrolelabeller_module, "SemanticRoleLabeller", "[s]", "--log=debug"); + if (instance == NULL) + { + std::cerr << "Cannot instantiate the SemanticRoleLabeller python class" << std::endl; + PyErr_Print(); + Py_Exit(1); + } + + // Run the semantic role labeller + PyObject* callResult = PyObject_CallMethod(instance, "annotate", "s", text.c_str()); + if (callResult == NULL) + { + std::cerr << "Failed to call the annotate method" << std::endl; + PyErr_Print(); + Py_Exit(1); + } + + // Display the SRL result + char* result = PyUnicode_AsUTF8(callResult); + if (result == NULL) + { + std::cerr << "Cannot convert result item to string" << std::endl; + PyErr_Print(); + Py_Exit(1); + } + std::cout << "Python result is:" << std::endl << result; + Py_Finalize(); + + return 0; + +} From ab9d3216eb7d31107842ccf43dee4f048bfa9766 Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Mon, 25 Apr 2016 23:57:31 +0200 Subject: [PATCH 40/82] Use QT_NO_KEYWORDS to avoid clash with Python lib Implies to use always Q_SLOTS, etc. instead of 'slots', etc. 
--- .../common/BagOfWords/tests/BagOfWordsTest2.h | 2 +- .../common/BagOfWords/tests/BagOfWordsTest3.h | 2 +- .../linguisticProcessing/common/tgv/TestCaseProcessor.cpp | 5 +++-- lima_pelf/benchmarkingTool/CommentEditDlg.h | 2 +- lima_pelf/benchmarkingTool/EvaluationResultDimension.h | 2 +- lima_pelf/benchmarkingTool/Pipeline.h | 2 +- lima_pelf/benchmarkingTool/PipelineConfigureDlg.h | 2 +- lima_pelf/benchmarkingTool/PipelineEditFileDlg.h | 2 +- lima_pelf/benchmarkingTool/PipelineWidget.h | 2 +- lima_pelf/resourceTool/DictionnaryEntryEditDlg.h | 2 +- lima_pelf/resourceTool/IdiomaticExpressionEntryEditDlg.h | 2 +- lima_pelf/resourceTool/ResourceEditorTableModel.h | 2 +- lima_pelf/resourceTool/ResourceEditorTableWidget.h | 2 +- lima_pelf/resourceTool/ResourceTool.h | 2 +- manageQt5.cmake | 1 + 15 files changed, 17 insertions(+), 15 deletions(-) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/tests/BagOfWordsTest2.h b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/tests/BagOfWordsTest2.h index 1845327ec..3eee57311 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/tests/BagOfWordsTest2.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/tests/BagOfWordsTest2.h @@ -6,7 +6,7 @@ class BagOfWordsTest2: public QObject { Q_OBJECT -private slots: +private Q_SLOTS: void initTestCase(); void test_indexElementDefaultConstructor(); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/tests/BagOfWordsTest3.h b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/tests/BagOfWordsTest3.h index 1770c0716..8219e1182 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/tests/BagOfWordsTest3.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/tests/BagOfWordsTest3.h @@ -6,7 +6,7 @@ class BagOfWordsTest3: public QObject { Q_OBJECT -private slots: +private Q_SLOTS: void 
initTestCase(); // BoWText with a BoWTerm and a BoWToken diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/tgv/TestCaseProcessor.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/tgv/TestCaseProcessor.cpp index ddc85a033..92688a679 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/tgv/TestCaseProcessor.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/tgv/TestCaseProcessor.cpp @@ -125,12 +125,13 @@ TestCaseError TestCaseProcessor::evalTestCase( right.removeDuplicates(); QSet sleft; - foreach (QString element, left) + + Q_FOREACH (QString element, left) { sleft.insert(element); } QSet sright; - foreach (QString element, right) + Q_FOREACH (QString element, right) { sright.insert(element); } diff --git a/lima_pelf/benchmarkingTool/CommentEditDlg.h b/lima_pelf/benchmarkingTool/CommentEditDlg.h index 31d6ea3dc..8148992f2 100644 --- a/lima_pelf/benchmarkingTool/CommentEditDlg.h +++ b/lima_pelf/benchmarkingTool/CommentEditDlg.h @@ -40,7 +40,7 @@ Q_OBJECT CommentEditDlg (QWidget* parent = 0); void init (BenchmarkingResult* br, Pipeline* p = 0); -public slots: +public Q_SLOTS: void submit (); diff --git a/lima_pelf/benchmarkingTool/EvaluationResultDimension.h b/lima_pelf/benchmarkingTool/EvaluationResultDimension.h index ebb7341ae..c45776f83 100644 --- a/lima_pelf/benchmarkingTool/EvaluationResultDimension.h +++ b/lima_pelf/benchmarkingTool/EvaluationResultDimension.h @@ -50,7 +50,7 @@ Q_OBJECT signals: void visibleChanged (); -public slots: +public Q_SLOTS: void updateVisibleChanged (int state); diff --git a/lima_pelf/benchmarkingTool/Pipeline.h b/lima_pelf/benchmarkingTool/Pipeline.h index 427d2c0df..c89008a2b 100644 --- a/lima_pelf/benchmarkingTool/Pipeline.h +++ b/lima_pelf/benchmarkingTool/Pipeline.h @@ -81,7 +81,7 @@ Q_OBJECT Qt::DropActions supportedDropActions() const; Qt::ItemFlags flags(const QModelIndex &index) const; -public slots: +public Q_SLOTS: void unitResultsChanged (PipelineUnit* 
pu, EvaluationResultSet* ers); diff --git a/lima_pelf/benchmarkingTool/PipelineConfigureDlg.h b/lima_pelf/benchmarkingTool/PipelineConfigureDlg.h index 8f32b6187..6f1af5c2a 100644 --- a/lima_pelf/benchmarkingTool/PipelineConfigureDlg.h +++ b/lima_pelf/benchmarkingTool/PipelineConfigureDlg.h @@ -43,7 +43,7 @@ Q_OBJECT PipelineConfigureDlg (QWidget* parent = 0); void init (Pipeline* p, QString workingDir, QString analyzerCmd, QString evaluatorCmd, int cp); -public slots: +public Q_SLOTS: void workingDirBrowse (); void analyzerCmdBrowse (); diff --git a/lima_pelf/benchmarkingTool/PipelineEditFileDlg.h b/lima_pelf/benchmarkingTool/PipelineEditFileDlg.h index e0de3ab28..ccb6ba46d 100644 --- a/lima_pelf/benchmarkingTool/PipelineEditFileDlg.h +++ b/lima_pelf/benchmarkingTool/PipelineEditFileDlg.h @@ -44,7 +44,7 @@ Q_OBJECT void init (Pipeline* bp); void setUnit (PipelineUnit* unit, int index); -public slots: +public Q_SLOTS: void textFileBrowse (); void referenceFileBrowse (); diff --git a/lima_pelf/benchmarkingTool/PipelineWidget.h b/lima_pelf/benchmarkingTool/PipelineWidget.h index 140352b56..29176463f 100644 --- a/lima_pelf/benchmarkingTool/PipelineWidget.h +++ b/lima_pelf/benchmarkingTool/PipelineWidget.h @@ -48,7 +48,7 @@ Q_OBJECT void keyPressEvent (QKeyEvent* event); void contextMenuEvent (QContextMenuEvent* event); -public slots: +public Q_SLOTS: void editUnit (const QModelIndex& index); void contextEdit (); diff --git a/lima_pelf/resourceTool/DictionnaryEntryEditDlg.h b/lima_pelf/resourceTool/DictionnaryEntryEditDlg.h index 6094d75f8..481fd8355 100644 --- a/lima_pelf/resourceTool/DictionnaryEntryEditDlg.h +++ b/lima_pelf/resourceTool/DictionnaryEntryEditDlg.h @@ -44,7 +44,7 @@ Q_OBJECT signals: void updateEntry (AbstractResourceEntry* de); -public slots: +public Q_SLOTS: void submit (); void checkValidity(); diff --git a/lima_pelf/resourceTool/IdiomaticExpressionEntryEditDlg.h b/lima_pelf/resourceTool/IdiomaticExpressionEntryEditDlg.h index 628630c4c..ab978d11e 
100644 --- a/lima_pelf/resourceTool/IdiomaticExpressionEntryEditDlg.h +++ b/lima_pelf/resourceTool/IdiomaticExpressionEntryEditDlg.h @@ -44,7 +44,7 @@ Q_OBJECT signals: void updateEntry (AbstractResourceEntry* de); -public slots: +public Q_SLOTS: void submit (); void checkValidity(); diff --git a/lima_pelf/resourceTool/ResourceEditorTableModel.h b/lima_pelf/resourceTool/ResourceEditorTableModel.h index 6336b31f3..ca4c6232b 100644 --- a/lima_pelf/resourceTool/ResourceEditorTableModel.h +++ b/lima_pelf/resourceTool/ResourceEditorTableModel.h @@ -42,7 +42,7 @@ Q_OBJECT void addEntry (); void deleteEntries (QModelIndexList indexList); -public slots: +public Q_SLOTS: void addEntry (AbstractResourceEntry* de); diff --git a/lima_pelf/resourceTool/ResourceEditorTableWidget.h b/lima_pelf/resourceTool/ResourceEditorTableWidget.h index 9baaedc76..b2b4dbb2c 100644 --- a/lima_pelf/resourceTool/ResourceEditorTableWidget.h +++ b/lima_pelf/resourceTool/ResourceEditorTableWidget.h @@ -43,7 +43,7 @@ Q_OBJECT void keyPressEvent (QKeyEvent* event); void contextMenuEvent (QContextMenuEvent* event); -public slots: +public Q_SLOTS: void editEntry (const QModelIndex& index); void createEntry (); diff --git a/lima_pelf/resourceTool/ResourceTool.h b/lima_pelf/resourceTool/ResourceTool.h index 507beb6ef..17bbd15f2 100644 --- a/lima_pelf/resourceTool/ResourceTool.h +++ b/lima_pelf/resourceTool/ResourceTool.h @@ -63,7 +63,7 @@ Q_OBJECT bool popPelfSharedMemory (QString msg); void logDebugMsg (QtMsgType type, const char* msg); -public slots: +public Q_SLOTS: void dictionnaryEditDlg (AbstractResourceEntry* are = 0); void dictionnarySearch (); diff --git a/manageQt5.cmake b/manageQt5.cmake index 45166305d..8c0a883c7 100644 --- a/manageQt5.cmake +++ b/manageQt5.cmake @@ -22,6 +22,7 @@ set(CMAKE_PREFIX_PATH ) # Add definitions and flags +add_definitions(-DQT_NO_KEYWORDS) add_definitions(-DQT_DISABLE_DEPRECATED_BEFORE=0) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -lpthread") From 
8e617d7d02dad7f70b7e5d677595b6561cdafb74 Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Tue, 26 Apr 2016 01:20:30 +0200 Subject: [PATCH 41/82] Allow to use temp files instead of suffix on input This change allows creating a temporary file and transmitting its name in the analysis metadata, to transfer written files from one process unit to another. Before, we had to use suffixes on the input filename, thus sometimes adding useless files in the source dir. The name of the file is stored in a metadata whose name is set in the process units configuration parameter 'temporaryFileMetadata'. --- .../AbstractTextualAnalysisDumper.cpp | 20 ++++- .../AbstractTextualAnalysisDumper.h | 1 + .../LinguisticProcessors/AnalysisLoader.cpp | 27 ++++++- .../LinguisticProcessors/AnalysisLoader.h | 1 + .../KnowledgeBasedSemanticRoleLabeler.cpp | 78 +++++++++++++++---- 5 files changed, 106 insertions(+), 21 deletions(-) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/AbstractTextualAnalysisDumper.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/AbstractTextualAnalysisDumper.cpp index 7f0ea1d6d..9613f92d2 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/AbstractTextualAnalysisDumper.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/AbstractTextualAnalysisDumper.cpp @@ -47,7 +47,8 @@ m_out(0), m_handlerName(), m_outputFile(), m_outputSuffix(), -m_append(false) +m_append(false), +m_temporaryFileMetadata() { } @@ -69,6 +70,12 @@ void AbstractTextualAnalysisDumper::init( } catch (NoSuchParam& ) { } // do nothing, optional + try + { + m_temporaryFileMetadata = QString::fromUtf8(unitConfiguration.getParamsValueAtKey("temporaryFileMetadata").c_str()); + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) {} // keep default value (empty) + try { m_outputSuffix=unitConfiguration.getParamsValueAtKey("outputSuffix"); @@ -119,6 +126,17 @@
initialize(AnalysisContent& analysis) const } } + if (! m_temporaryFileMetadata.isEmpty()) { +#ifdef DEBUG_LP + LDEBUG << "AbstractTextualAnalysisDumper: initialize DumperStream with output file "<< m_outputFile; +#endif + LinguisticMetaData* metadata=static_cast(analysis.getData("LinguisticMetaData")); + if (metadata == 0) { + LERROR << "no LinguisticMetaData ! abort"; + } + return new DumperStream(metadata->getMetaData(m_temporaryFileMetadata.toUtf8().constData()),m_append); + } + if (! m_outputFile.empty()) { #ifdef DEBUG_LP LDEBUG << "AbstractTextualAnalysisDumper: initialize DumperStream with output file "<< m_outputFile; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/AbstractTextualAnalysisDumper.h b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/AbstractTextualAnalysisDumper.h index 617fdc48d..9438ab56a 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/AbstractTextualAnalysisDumper.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/AbstractTextualAnalysisDumper.h @@ -69,6 +69,7 @@ class LIMA_LINGUISTICPROCESSORS_EXPORT AbstractTextualAnalysisDumper : public Me std::string m_outputFile; /* < the file name for local file logging */ std::string m_outputSuffix; /* < the suffix for local file logging */ bool m_append; + QString m_temporaryFileMetadata; }; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/AnalysisLoader.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/AnalysisLoader.cpp index af07d5fb2..a2b1e11cb 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/AnalysisLoader.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/AnalysisLoader.cpp @@ -40,7 +40,8 @@ SimpleFactory AnalysisLoaderFactory(ANALYSISLOA AnalysisLoader::AnalysisLoader(): MediaProcessUnit(), 
m_inputFileName(), -m_inputFileExtension() +m_inputFileExtension(), +m_temporaryFileMetadata() { } @@ -56,6 +57,13 @@ void AnalysisLoader::init(Common::XMLConfigurationFiles::GroupConfigurationStruc LDEBUG << "Initialization"; bool parameterFound(false); + try + { + m_temporaryFileMetadata = QString::fromUtf8(unitConfiguration.getParamsValueAtKey("temporaryFileMetadata").c_str()); + parameterFound=true; + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) {} // keep default value (empty) + try { m_inputFileName=unitConfiguration.getParamsValueAtKey("inputFile"); parameterFound=true; @@ -71,7 +79,7 @@ void AnalysisLoader::init(Common::XMLConfigurationFiles::GroupConfigurationStruc } if (! parameterFound) { - LERROR << "No 'inputFile' or 'inputSuffix' parameter in AnalysisLoader"; + LERROR << "No 'inputFile' or 'inputSuffix' or 'temporaryFileMetadata' parameter in AnalysisLoader"; throw InvalidConfiguration(); } @@ -80,7 +88,20 @@ void AnalysisLoader::init(Common::XMLConfigurationFiles::GroupConfigurationStruc const std::string& AnalysisLoader::getInputFile(AnalysisContent& analysis) const { static std::string inputFile(""); - if (! m_inputFileName.empty()) { + if (! m_temporaryFileMetadata.isEmpty()) { + // get temporary filename from metadata + LinguisticMetaData* metadata=static_cast(analysis.getData("LinguisticMetaData")); + if (metadata == 0) + { + LOGINIT("LP::AnalysisLoader"); + LERROR << "no LinguisticMetaData : cannot use 'temporaryFileMetadata' parameter for AnalysisLoader"; + return inputFile; + } + + inputFile = metadata->getMetaData(m_temporaryFileMetadata.toUtf8().constData()); + return inputFile; + } + else if (! m_inputFileName.empty()) { return m_inputFileName; } else if (! 
m_inputFileExtension.empty()) { diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/AnalysisLoader.h b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/AnalysisLoader.h index b46dddc44..f1cef4219 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/AnalysisLoader.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/AnalysisLoader.h @@ -64,6 +64,7 @@ class LIMA_LINGUISTICPROCESSORS_EXPORT AnalysisLoader : public MediaProcessUnit protected: std::string m_inputFileName; std::string m_inputFileExtension; + QString m_temporaryFileMetadata; }; } // end namespace diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp index e5de85f07..c56009a07 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp @@ -64,6 +64,7 @@ class KnowledgeBasedSemanticRoleLabelerPrivate const MediaProcessUnit* m_loader; QString m_inputSuffix; QString m_outputSuffix; + QString m_temporaryFileMetadata; }; KnowledgeBasedSemanticRoleLabelerPrivate::KnowledgeBasedSemanticRoleLabelerPrivate() : @@ -120,18 +121,29 @@ void KnowledgeBasedSemanticRoleLabeler::init( throw InvalidConfiguration(); } + try { + m_d->m_temporaryFileMetadata = QString::fromUtf8(unitConfiguration.getParamsValueAtKey("temporaryFileMetadata").c_str()); + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + // optional parameter: keep default value (empty) + } + try { m_d->m_inputSuffix=QString::fromUtf8(unitConfiguration.getParamsValueAtKey("inputSuffix").c_str()); } catch (Common::XMLConfigurationFiles::NoSuchParam& ) { - // 
optional parameter: keep default value + LERROR << "Missing 'inputSuffix' parameter in KnowledgeBasedSemanticRoleLabeler group for language " + << (int)language << " !"; + throw InvalidConfiguration(); } try { m_d->m_outputSuffix=QString::fromUtf8(unitConfiguration.getParamsValueAtKey("outputSuffix").c_str()); } catch (Common::XMLConfigurationFiles::NoSuchParam& ) { - // optional parameter: keep default value + LERROR << "Missing 'outputSuffix' parameter in KnowledgeBasedSemanticRoleLabeler group for language " + << (int)language << " !"; + throw InvalidConfiguration(); } QString path; @@ -245,6 +257,16 @@ LimaStatusCode KnowledgeBasedSemanticRoleLabeler::process( LERROR << "no LinguisticMetaData ! abort"; return MISSING_DATA; } + + QScopedPointer temporaryFile; + if (!m_d->m_temporaryFileMetadata.isEmpty()) + { + QScopedPointer otherTemp(new QTemporaryFile()); + temporaryFile.swap(otherTemp); + temporaryFile->open(); + metadata->setMetaData(m_d->m_temporaryFileMetadata.toUtf8().constData(), + temporaryFile->fileName().toUtf8().constData()); + } // Use CoNLL duper to produce the input to the SRL LimaStatusCode returnCode(SUCCESS_ID); @@ -254,19 +276,26 @@ LimaStatusCode KnowledgeBasedSemanticRoleLabeler::process( return returnCode; } - QString fileName = QString::fromUtf8(metadata->getMetaData("FileName").c_str()); - QString inputFilename, outputFilename; - if (!m_d->m_inputSuffix.isEmpty()) + QString conllInput; + + if (m_d->m_temporaryFileMetadata.isEmpty()) { - inputFilename = fileName+ m_d->m_inputSuffix; + QString fileName = QString::fromUtf8(metadata->getMetaData("FileName").c_str()); + QString inputFilename; + if (!m_d->m_inputSuffix.isEmpty()) + { + inputFilename = fileName+ m_d->m_inputSuffix; + } + QFile inputFile(inputFilename); + inputFile.open(QIODevice::ReadOnly); + conllInput = QString::fromUtf8(inputFile.readAll().constData()); + inputFile.close(); } - QFile inputFile(inputFilename); - inputFile.open(QIODevice::ReadOnly); - QString conllInput = 
QString::fromUtf8(inputFile.readAll().constData()); - inputFile.close(); - if (!m_d->m_outputSuffix.isEmpty()) + else { - outputFilename = fileName + m_d->m_outputSuffix; + temporaryFile->open(); + conllInput = QString::fromUtf8(temporaryFile->readAll().constData()); + temporaryFile->close(); } // Run the semantic role labeller @@ -287,11 +316,26 @@ LimaStatusCode KnowledgeBasedSemanticRoleLabeler::process( Py_Exit(1); } LDEBUG << "Python result is:" << result; - QFile outputFile(outputFilename); - outputFile.open(QIODevice::WriteOnly); - outputFile.write(result); - outputFile.close(); - + if (m_d->m_temporaryFileMetadata.isEmpty()) + { + QString outputFilename; + if (!m_d->m_outputSuffix.isEmpty()) + { + QString fileName = QString::fromUtf8(metadata->getMetaData("FileName").c_str()); + outputFilename = fileName + m_d->m_outputSuffix; + } + QFile outputFile(outputFilename); + outputFile.open(QIODevice::WriteOnly); + outputFile.write(result); + outputFile.close(); + } + else + { + temporaryFile->open(); + temporaryFile->seek(0); + temporaryFile->write(result); + temporaryFile->close(); + } // Import the CoNLL result returnCode=m_d->m_loader->process(analysis); if (returnCode!=SUCCESS_ID) { From 42eb5583ee9769d2f909fd036d99272edf11758a Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Tue, 26 Apr 2016 01:31:36 +0200 Subject: [PATCH 42/82] Correct typo in symbols --- lima_linguisticprocessing/conf/lima-lp-eng.xml | 2 +- .../KnowledgeBasedSemanticRoleLabeler.cpp | 12 ++++++------ lima_linguisticprocessing/test/srl.cpp | 12 ++++++------ 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/lima_linguisticprocessing/conf/lima-lp-eng.xml b/lima_linguisticprocessing/conf/lima-lp-eng.xml index 34a64fa0e..1201d3e96 100644 --- a/lima_linguisticprocessing/conf/lima-lp-eng.xml +++ b/lima_linguisticprocessing/conf/lima-lp-eng.xml @@ -53,7 +53,7 @@ - + diff --git 
a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp index c56009a07..7faaef03d 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp @@ -226,20 +226,20 @@ void KnowledgeBasedSemanticRoleLabeler::init( Py_Exit(1); } - // Import the semanticrolelabeller module - PyObject* semanticrolelabeller_module = PyImport_ImportModule("semanticrolelabeller"); - if (semanticrolelabeller_module == NULL) + // Import the semanticrolelabeler module + PyObject* semanticrolelabeler_module = PyImport_ImportModule("semanticrolelabeler"); + if (semanticrolelabeler_module == NULL) { - LERROR << "Failed to import srl semanticrolelabeller module"; + LERROR << "Failed to import srl semanticrolelabeler module"; PyErr_Print(); Py_Exit(1); } // Create the semantic role labeller instance - m_d->m_instance = PyObject_CallMethod(semanticrolelabeller_module, "SemanticRoleLabeller", "[s]", QString("--log=%1").arg(kbsrlLogLevel).toUtf8().constData()); + m_d->m_instance = PyObject_CallMethod(semanticrolelabeler_module, "SemanticRoleLabeler", "[s]", QString("--log=%1").arg(kbsrlLogLevel).toUtf8().constData()); if (m_d->m_instance == NULL) { - LERROR << "Cannot instantiate the SemanticRoleLabeller python class"; + LERROR << "Cannot instantiate the SemanticRoleLabeler python class"; PyErr_Print(); Py_Exit(1); } diff --git a/lima_linguisticprocessing/test/srl.cpp b/lima_linguisticprocessing/test/srl.cpp index 4275f531a..65bcb2709 100644 --- a/lima_linguisticprocessing/test/srl.cpp +++ b/lima_linguisticprocessing/test/srl.cpp @@ -79,20 +79,20 @@ std::string text = "1 The the DET DT _ _ 2 NMOD _ _\n" Py_Exit(1); } - // Import the semanticrolelabeller 
module - PyObject* semanticrolelabeller_module = PyImport_ImportModule("semanticrolelabeller"); - if (semanticrolelabeller_module == NULL) + // Import the semanticrolelabeler module + PyObject* semanticrolelabeler_module = PyImport_ImportModule("semanticrolelabeler"); + if (semanticrolelabeler_module == NULL) { - std::cerr << "Failed to import srl semanticrolelabeller module" << std::endl; + std::cerr << "Failed to import srl semanticrolelabeler module" << std::endl; PyErr_Print(); Py_Exit(1); } // Create the semantic role labeller instance - PyObject* instance = PyObject_CallMethod(semanticrolelabeller_module, "SemanticRoleLabeller", "[s]", "--log=debug"); + PyObject* instance = PyObject_CallMethod(semanticrolelabeler_module, "SemanticRoleLabeler", "[s]", "--log=debug"); if (instance == NULL) { - std::cerr << "Cannot instantiate the SemanticRoleLabeller python class" << std::endl; + std::cerr << "Cannot instantiate the SemanticRoleLabeler python class" << std::endl; PyErr_Print(); Py_Exit(1); } From 6ff93ca573120aec5e7e62c7ed81eb0ca35af614 Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Tue, 26 Apr 2016 01:38:03 +0200 Subject: [PATCH 43/82] Change remaining Qt keywords to macros --- lima_annoqt/src/kcolorbutton.cpp | 2 +- lima_annoqt/src/kcolorcombo.cpp | 4 ++-- lima_annoqt/src/kcolordialog.cpp | 22 +++++++++---------- lima_annoqt/src/kcolordialog.h | 4 ++-- lima_annoqt/src/kcolorvalueselector.cpp | 2 +- lima_annoqt/src/kxyselector.cpp | 4 ++-- lima_annoqt/src/specificEntity.cpp | 2 +- .../EvaluationResultDimension.cpp | 2 +- .../EvaluationResultDimension.h | 2 +- lima_pelf/benchmarkingTool/Pipeline.cpp | 20 ++++++++--------- lima_pelf/benchmarkingTool/Pipeline.h | 2 +- lima_pelf/benchmarkingTool/PipelineUnit.cpp | 2 +- lima_pelf/benchmarkingTool/ResultsWidget.cpp | 4 ++-- .../resourceTool/DictionnaryEntryEditDlg.cpp | 2 +- .../resourceTool/DictionnaryEntryEditDlg.h | 2 +- .../IdiomaticExpressionEntryEditDlg.cpp | 2 +- .../IdiomaticExpressionEntryEditDlg.h 
| 2 +- .../resourceTool/ResourceEditorTableModel.cpp | 4 ++-- .../ResourceEditorTableWidget.cpp | 4 ++-- .../resourceTool/ResourceEditorTableWidget.h | 2 +- .../resourceTool/ResourceReaderSimpleModel.h | 2 +- .../resourceTool/ResourceReaderTableModel.cpp | 2 +- .../resourceTool/ResourceReaderTableModel.h | 2 +- .../resourceTool/ResourceReaderToolBoxModel.h | 2 +- 24 files changed, 49 insertions(+), 49 deletions(-) diff --git a/lima_annoqt/src/kcolorbutton.cpp b/lima_annoqt/src/kcolorbutton.cpp index c5eb896d2..f6d36a780 100644 --- a/lima_annoqt/src/kcolorbutton.cpp +++ b/lima_annoqt/src/kcolorbutton.cpp @@ -106,7 +106,7 @@ void KColorButton::setColor( const QColor &c ) if ( d->col != c ) { d->col = c; repaint(); - emit changed( d->col ); + Q_EMIT changed( d->col ); } } diff --git a/lima_annoqt/src/kcolorcombo.cpp b/lima_annoqt/src/kcolorcombo.cpp index 52709f990..0b7875c07 100644 --- a/lima_annoqt/src/kcolorcombo.cpp +++ b/lima_annoqt/src/kcolorcombo.cpp @@ -334,7 +334,7 @@ void KColorComboPrivate::_k_slotActivated(int index) internalcolor = colorList[index - 1]; } - emit q->activated(internalcolor); + Q_EMIT q->activated(internalcolor); } void KColorComboPrivate::_k_slotHighlighted(int index) @@ -347,7 +347,7 @@ void KColorComboPrivate::_k_slotHighlighted(int index) internalcolor = colorList[index - 1]; } - emit q->highlighted(internalcolor); + Q_EMIT q->highlighted(internalcolor); } void KColorComboPrivate::addColors() diff --git a/lima_annoqt/src/kcolordialog.cpp b/lima_annoqt/src/kcolordialog.cpp index c31781314..215ee6c98 100644 --- a/lima_annoqt/src/kcolordialog.cpp +++ b/lima_annoqt/src/kcolordialog.cpp @@ -383,7 +383,7 @@ void KColorCells::mouseReleaseEvent(QMouseEvent *e) d->inMouse = false; if (cell != -1) - emit colorSelected(cell , color(cell)); + Q_EMIT colorSelected(cell , color(cell)); } QTableWidget::mouseReleaseEvent(e); @@ -394,7 +394,7 @@ void KColorCells::mouseDoubleClickEvent(QMouseEvent * /*e*/) int cell = positionToCell(d->mousePos); if (cell 
!= -1) - emit colorDoubleClicked(cell , color(cell)); + Q_EMIT colorDoubleClicked(cell , color(cell)); } @@ -455,7 +455,7 @@ void KColorPatch::dropEvent(QDropEvent *event) QColor c = KColorMimeData::fromMimeData(event->mimeData()); if (c.isValid()) { setColor(c); - emit colorChanged(c); + Q_EMIT colorChanged(c); } } @@ -625,12 +625,12 @@ KColorTable::KColorTablePrivate::slotShowNamedColorReadError(void) // // 2000-02-12 Espen Sand -// Set the color in two steps. The setColors() slot will not emit a signal +// Set the color in two steps. The setColors() slot will not Q_EMIT a signal // with the current color setting. The reason is that setColors() is used // by the color selector dialog on startup. In the color selector dialog // we normally want to display a startup color which we specify // when the dialog is started. The slotSetColors() slot below will -// set the palette and then use the information to emit a signal with the +// set the palette and then use the information to Q_EMIT a signal with the // new color setting. It is only used by the combobox widget. 
// void @@ -729,7 +729,7 @@ KColorTable::KColorTablePrivate::slotColorCellSelected(int index , const QColor& { if (!mPalette || (index >= mPalette->count())) return; - emit q->colorSelected(mPalette->color(index), mPalette->name(index)); + Q_EMIT q->colorSelected(mPalette->color(index), mPalette->name(index)); } void @@ -737,14 +737,14 @@ KColorTable::KColorTablePrivate::slotColorCellDoubleClicked(int index , const QC { if (!mPalette || (index >= mPalette->count())) return; - emit q->colorDoubleClicked(mPalette->color(index), mPalette->name(index)); + Q_EMIT q->colorDoubleClicked(mPalette->color(index), mPalette->name(index)); } void KColorTable::KColorTablePrivate::slotColorTextSelected(const QString &colorText) { - emit q->colorSelected(m_namedColorMap[ colorText ], colorText); + Q_EMIT q->colorSelected(m_namedColorMap[ colorText ], colorText); } @@ -1175,7 +1175,7 @@ void KColorDialog::KColorDialogPrivate::slotDefaultColorClicked() } else { showColor(selColor, QString()); } - emit q->colorSelected(selColor); + Q_EMIT q->colorSelected(selColor); } void @@ -1465,10 +1465,10 @@ void KColorDialog::KColorDialogPrivate::_setColor(const QColor &color, const QSt showColor(selColor, name); - emit q->colorSelected(selColor); + Q_EMIT q->colorSelected(selColor); } -// show but don't set into selColor, nor emit colorSelected +// show but don't set into selColor, nor Q_EMIT colorSelected void KColorDialog::KColorDialogPrivate::showColor(const QColor &color, const QString &name) { bRecursion = true; diff --git a/lima_annoqt/src/kcolordialog.h b/lima_annoqt/src/kcolordialog.h index d3988cef9..297cc0a0b 100644 --- a/lima_annoqt/src/kcolordialog.h +++ b/lima_annoqt/src/kcolordialog.h @@ -226,8 +226,8 @@ class KColorSpinBox : public QSpinBox virtual void valueChange() { updateDisplay(); - emit valueChanged( value() ); - emit valueChanged( currentValueText() ); + Q_EMIT valueChanged( value() ); + Q_EMIT valueChanged( currentValueText() ); }*/ }; diff --git 
a/lima_annoqt/src/kcolorvalueselector.cpp b/lima_annoqt/src/kcolorvalueselector.cpp index 8a38c569c..ff525151a 100644 --- a/lima_annoqt/src/kcolorvalueselector.cpp +++ b/lima_annoqt/src/kcolorvalueselector.cpp @@ -108,7 +108,7 @@ void KColorValueSelector::setChooserMode( KColorChooserMode c ) d->_mode = c; //really needed? - //emit modeChanged(); + //Q_EMIT modeChanged(); } KColorChooserMode KColorValueSelector::chooserMode () const diff --git a/lima_annoqt/src/kxyselector.cpp b/lima_annoqt/src/kxyselector.cpp index 92cf6a128..d372b1bad 100644 --- a/lima_annoqt/src/kxyselector.cpp +++ b/lima_annoqt/src/kxyselector.cpp @@ -188,7 +188,7 @@ void KXYSelector::mouseMoveEvent( QMouseEvent *e ) valuesFromPosition( e->pos().x() - w, e->pos().y() - w, xVal, yVal ); setValues( xVal, yVal ); - emit valueChanged( d->xPos, d->yPos ); + Q_EMIT valueChanged( d->xPos, d->yPos ); } void KXYSelector::wheelEvent( QWheelEvent *e ) @@ -198,7 +198,7 @@ void KXYSelector::wheelEvent( QWheelEvent *e ) else setValues( xValue(), yValue() + e->delta()/120 ); - emit valueChanged( d->xPos, d->yPos ); + Q_EMIT valueChanged( d->xPos, d->yPos ); } void KXYSelector::valuesFromPosition( int x, int y, int &xVal, int &yVal ) const diff --git a/lima_annoqt/src/specificEntity.cpp b/lima_annoqt/src/specificEntity.cpp index 4d624b77f..f41e02358 100644 --- a/lima_annoqt/src/specificEntity.cpp +++ b/lima_annoqt/src/specificEntity.cpp @@ -55,6 +55,6 @@ SpecificEntity& SpecificEntity::operator=(const SpecificEntity& se) void SpecificEntity::slotTriggered() { qDebug() << "SpecificEntity::slotTriggered"; - emit triggered( this ); + Q_EMIT triggered( this ); } diff --git a/lima_pelf/benchmarkingTool/EvaluationResultDimension.cpp b/lima_pelf/benchmarkingTool/EvaluationResultDimension.cpp index 0624d20b0..1808d9e66 100644 --- a/lima_pelf/benchmarkingTool/EvaluationResultDimension.cpp +++ b/lima_pelf/benchmarkingTool/EvaluationResultDimension.cpp @@ -44,7 +44,7 @@ 
EvaluationResultDimension::~EvaluationResultDimension() void EvaluationResultDimension::updateVisibleChanged (int state) { visibilityState = (Qt::CheckState)state; - emit visibleChanged(); + Q_EMIT visibleChanged(); } #include "EvaluationResultDimension.moc" diff --git a/lima_pelf/benchmarkingTool/EvaluationResultDimension.h b/lima_pelf/benchmarkingTool/EvaluationResultDimension.h index c45776f83..1d0e42486 100644 --- a/lima_pelf/benchmarkingTool/EvaluationResultDimension.h +++ b/lima_pelf/benchmarkingTool/EvaluationResultDimension.h @@ -47,7 +47,7 @@ Q_OBJECT EvaluationResultDimension (QString n, QString k, QColor c, int i, bool r); -signals: +Q_SIGNALS: void visibleChanged (); public Q_SLOTS: diff --git a/lima_pelf/benchmarkingTool/Pipeline.cpp b/lima_pelf/benchmarkingTool/Pipeline.cpp index c8eb88528..eb7061237 100644 --- a/lima_pelf/benchmarkingTool/Pipeline.cpp +++ b/lima_pelf/benchmarkingTool/Pipeline.cpp @@ -121,7 +121,7 @@ void Pipeline::moveUnits (QModelIndexList sourceIndexes, QModelIndex targetIndex return; qDebug() << "Reordering pipeline files"; QList::const_iterator sourceIndexIt; - emit layoutAboutToBeChanged(); + Q_EMIT layoutAboutToBeChanged(); qSort(sourceIndexes); QList movedUnits; int shiftSourceIndexes = 0, shiftTargetIndex = 0, targetIndexRow = targetIndex.row(); @@ -138,7 +138,7 @@ void Pipeline::moveUnits (QModelIndexList sourceIndexes, QModelIndex targetIndex units.insert(units.begin() + targetIndex.row() + shiftTargetIndex, movedUnit); shiftTargetIndex++; } - emit layoutChanged(); + Q_EMIT layoutChanged(); pipelineView->clearSelection(); unitsUpdate(); setDirty(); @@ -162,12 +162,12 @@ void Pipeline::deleteUnits (QModelIndexList unitIndexes) { qDebug() << "Removing pipeline files"; QList::const_iterator unitIndexesIt; - emit layoutAboutToBeChanged(); + Q_EMIT layoutAboutToBeChanged(); qSort(unitIndexes); int shiftIndexes = 0; for(unitIndexesIt = unitIndexes.constBegin(); unitIndexesIt != unitIndexes.constEnd(); unitIndexesIt++) 
shiftIndexes = deleteUnit(*unitIndexesIt, shiftIndexes); - emit layoutChanged(); + Q_EMIT layoutChanged(); setDirty(); pipelineView->clearSelection(); } @@ -190,12 +190,12 @@ int Pipeline::deleteUnit (QModelIndex unitIndex, int shiftIndexes) void Pipeline::clearUnits () { - emit layoutAboutToBeChanged(); + Q_EMIT layoutAboutToBeChanged(); QList::iterator unitsIt = units.begin(); for(; unitsIt < units.end(); unitsIt++) delete (*unitsIt); units.clear(); - emit layoutChanged(); + Q_EMIT layoutChanged(); pipelineView->clearSelection(); unitsUpdate(); setDirty(); @@ -225,7 +225,7 @@ bool Pipeline::resetBenchmarking () for(; unitsIt < units.end(); unitsIt++) (*unitsIt)->status = PipelineUnit::STATUS_UNPROCESSED; pipelineView->reset(); - emit resultsChanged(); + Q_EMIT resultsChanged(); qDebug() << "Reseted pipeline files processing states"; processing = false; return true; @@ -318,7 +318,7 @@ void Pipeline::continueBenchmarking () for(; unitsIt != units.end(); unitsIt++) if((*unitsIt)->status == PipelineUnit::STATUS_PROCESSED) (*unitsIt)->status = PipelineUnit::STATUS_UNPROCESSED; - emit finishedBenchmarking(); + Q_EMIT finishedBenchmarking(); } } @@ -334,7 +334,7 @@ void Pipeline::unitsUpdate () newResultUnits[*unitsIt] = result->resultUnits[*unitsIt]; result->resultUnits = newResultUnits; } - emit unitsChanged(); + Q_EMIT unitsChanged(); } void Pipeline::unitResultsChanged (PipelineUnit* pu, EvaluationResultSet* ers) @@ -354,7 +354,7 @@ void Pipeline::unitResultsChanged (PipelineUnit* pu, EvaluationResultSet* ers) qDebug() << "Pipeline::unitResultsChanged startTime"<reset(); - emit resultsChanged(); + Q_EMIT resultsChanged(); if(processing) continueBenchmarking(); } diff --git a/lima_pelf/benchmarkingTool/Pipeline.h b/lima_pelf/benchmarkingTool/Pipeline.h index c89008a2b..e313302cd 100644 --- a/lima_pelf/benchmarkingTool/Pipeline.h +++ b/lima_pelf/benchmarkingTool/Pipeline.h @@ -85,7 +85,7 @@ public Q_SLOTS: void unitResultsChanged (PipelineUnit* pu, 
EvaluationResultSet* ers); -signals: +Q_SIGNALS: void unitsChanged (); void resultsChanged (); diff --git a/lima_pelf/benchmarkingTool/PipelineUnit.cpp b/lima_pelf/benchmarkingTool/PipelineUnit.cpp index 2962b3e71..458ca62a1 100644 --- a/lima_pelf/benchmarkingTool/PipelineUnit.cpp +++ b/lima_pelf/benchmarkingTool/PipelineUnit.cpp @@ -108,7 +108,7 @@ void PipelineUnit::commandFinished (int exitCode, QProcess::ExitStatus exitStatu resultSet->findEvaluationResults(output); // qDebug() << "Finished benchmarking file " << name << ", evaluation results found, processing results output"; status = STATUS_PROCESSED; - emit unitResultsChanged(this, resultSet); + Q_EMIT unitResultsChanged(this, resultSet); } } diff --git a/lima_pelf/benchmarkingTool/ResultsWidget.cpp b/lima_pelf/benchmarkingTool/ResultsWidget.cpp index 938668af7..c80d87d0c 100644 --- a/lima_pelf/benchmarkingTool/ResultsWidget.cpp +++ b/lima_pelf/benchmarkingTool/ResultsWidget.cpp @@ -114,7 +114,7 @@ void ResultsWidget::contextDelete () ) == QMessageBox::Ok) { pipeline->results.remove(benchmarkingResultTime); - emit resultsChanged(); + Q_EMIT resultsChanged(); } } @@ -122,7 +122,7 @@ void ResultsWidget::contextView () { if(selectedIndexes().size() <= 0) return; - emit viewResult(selectedIndexes().first().row()); + Q_EMIT viewResult(selectedIndexes().first().row()); } diff --git a/lima_pelf/resourceTool/DictionnaryEntryEditDlg.cpp b/lima_pelf/resourceTool/DictionnaryEntryEditDlg.cpp index 03f166bfe..b82ebcabd 100644 --- a/lima_pelf/resourceTool/DictionnaryEntryEditDlg.cpp +++ b/lima_pelf/resourceTool/DictionnaryEntryEditDlg.cpp @@ -74,7 +74,7 @@ void DictionnaryEntryEditDlg::submit () dictionnaryEntry->normalization = normalizationInp->text(); dictionnaryEntry->category = categoryCb->itemText(categoryCb->currentIndex()); dictionnaryEntry->displayable = true; - emit updateEntry(dictionnaryEntry); + Q_EMIT updateEntry(dictionnaryEntry); } #include "DictionnaryEntryEditDlg.moc" diff --git 
a/lima_pelf/resourceTool/DictionnaryEntryEditDlg.h b/lima_pelf/resourceTool/DictionnaryEntryEditDlg.h index 481fd8355..f6ab07fb3 100644 --- a/lima_pelf/resourceTool/DictionnaryEntryEditDlg.h +++ b/lima_pelf/resourceTool/DictionnaryEntryEditDlg.h @@ -41,7 +41,7 @@ Q_OBJECT DictionnaryEntryEditDlg (QWidget* parent = 0); void init (ResourceEditorTableModel* rem, AbstractResourceEntry* are = 0); -signals: +Q_SIGNALS: void updateEntry (AbstractResourceEntry* de); public Q_SLOTS: diff --git a/lima_pelf/resourceTool/IdiomaticExpressionEntryEditDlg.cpp b/lima_pelf/resourceTool/IdiomaticExpressionEntryEditDlg.cpp index fd7984f80..3c305f51b 100644 --- a/lima_pelf/resourceTool/IdiomaticExpressionEntryEditDlg.cpp +++ b/lima_pelf/resourceTool/IdiomaticExpressionEntryEditDlg.cpp @@ -87,7 +87,7 @@ void IdiomaticExpressionEntryEditDlg::submit () idiomaticExpressionEntry->lemma = lemmaInp->text(); idiomaticExpressionEntry->contextual = contextualCb->itemText(contextualCb->currentIndex()); idiomaticExpressionEntry->displayable = true; - emit updateEntry(idiomaticExpressionEntry); + Q_EMIT updateEntry(idiomaticExpressionEntry); } #include "IdiomaticExpressionEntryEditDlg.moc" diff --git a/lima_pelf/resourceTool/IdiomaticExpressionEntryEditDlg.h b/lima_pelf/resourceTool/IdiomaticExpressionEntryEditDlg.h index ab978d11e..646756487 100644 --- a/lima_pelf/resourceTool/IdiomaticExpressionEntryEditDlg.h +++ b/lima_pelf/resourceTool/IdiomaticExpressionEntryEditDlg.h @@ -41,7 +41,7 @@ Q_OBJECT IdiomaticExpressionEntryEditDlg (QWidget* parent = 0); void init (ResourceEditorTableModel* rem, AbstractResourceEntry* are = 0); -signals: +Q_SIGNALS: void updateEntry (AbstractResourceEntry* de); public Q_SLOTS: diff --git a/lima_pelf/resourceTool/ResourceEditorTableModel.cpp b/lima_pelf/resourceTool/ResourceEditorTableModel.cpp index 17e54d515..eea78c1fc 100644 --- a/lima_pelf/resourceTool/ResourceEditorTableModel.cpp +++ b/lima_pelf/resourceTool/ResourceEditorTableModel.cpp @@ -64,7 +64,7 @@ void 
ResourceEditorTableModel::addEntry (AbstractResourceEntry* de) { dataModified = true; availableData << de; - emit dataChanged(); + Q_EMIT dataChanged(); } void ResourceEditorTableModel::deleteEntries (QModelIndexList indexList) @@ -77,7 +77,7 @@ void ResourceEditorTableModel::deleteEntries (QModelIndexList indexList) entriesList << entry; for(int i = 0; i < entriesList.size(); i++) availableData.removeAll(entriesList.at(i)); - emit dataChanged(); + Q_EMIT dataChanged(); } #include "ResourceEditorTableModel.moc" diff --git a/lima_pelf/resourceTool/ResourceEditorTableWidget.cpp b/lima_pelf/resourceTool/ResourceEditorTableWidget.cpp index b8800e42f..8d1dd5db6 100644 --- a/lima_pelf/resourceTool/ResourceEditorTableWidget.cpp +++ b/lima_pelf/resourceTool/ResourceEditorTableWidget.cpp @@ -45,12 +45,12 @@ void ResourceEditorTableWidget::editEntry (const QModelIndex& index) if(entry == 0) return; retm->dataModified = true; - emit editEntryDlg(entry); + Q_EMIT editEntryDlg(entry); } void ResourceEditorTableWidget::createEntry () { - emit editEntryDlg(); + Q_EMIT editEntryDlg(); } void ResourceEditorTableWidget::keyPressEvent (QKeyEvent* event) diff --git a/lima_pelf/resourceTool/ResourceEditorTableWidget.h b/lima_pelf/resourceTool/ResourceEditorTableWidget.h index b2b4dbb2c..0865d64b9 100644 --- a/lima_pelf/resourceTool/ResourceEditorTableWidget.h +++ b/lima_pelf/resourceTool/ResourceEditorTableWidget.h @@ -50,7 +50,7 @@ public Q_SLOTS: void contextEdit (); void contextDelete (); -signals: +Q_SIGNALS: void editEntryDlg (AbstractResourceEntry* are = 0); diff --git a/lima_pelf/resourceTool/ResourceReaderSimpleModel.h b/lima_pelf/resourceTool/ResourceReaderSimpleModel.h index 7fbeec6a5..1292889e8 100644 --- a/lima_pelf/resourceTool/ResourceReaderSimpleModel.h +++ b/lima_pelf/resourceTool/ResourceReaderSimpleModel.h @@ -81,7 +81,7 @@ class ResourceReaderSimpleModel void install (); virtual void installFinished (int exitCode, QProcess::ExitStatus exitStatus); virtual void 
installError (QProcess::ProcessError error); - virtual void emitDataInstalled (bool success) = 0; // Virtual emit function (needed for class to be polymorphic) + virtual void emitDataInstalled (bool success) = 0; // Virtual Q_EMIT function (needed for class to be polymorphic) protected: QString installComand; diff --git a/lima_pelf/resourceTool/ResourceReaderTableModel.cpp b/lima_pelf/resourceTool/ResourceReaderTableModel.cpp index b95dfcf22..51e2fbf00 100644 --- a/lima_pelf/resourceTool/ResourceReaderTableModel.cpp +++ b/lima_pelf/resourceTool/ResourceReaderTableModel.cpp @@ -72,7 +72,7 @@ void ResourceReaderTableModel::sortByHeader (int column, Qt::SortOrder order) sortedHeaderColumn = column; sortedHeaderOrder = order; qSort(availableData.begin(), availableData.end(), headerLessThan); - emit dataChanged(); + Q_EMIT dataChanged(); } bool ResourceReaderTableModel::headerLessThan (AbstractResourceEntry* entry1, AbstractResourceEntry* entry2) diff --git a/lima_pelf/resourceTool/ResourceReaderTableModel.h b/lima_pelf/resourceTool/ResourceReaderTableModel.h index 427e1d305..05a4950bb 100644 --- a/lima_pelf/resourceTool/ResourceReaderTableModel.h +++ b/lima_pelf/resourceTool/ResourceReaderTableModel.h @@ -52,7 +52,7 @@ Q_OBJECT void sortByHeader (int column, Qt::SortOrder order); static bool headerLessThan (AbstractResourceEntry* entry1, AbstractResourceEntry* entry2); QVariant data (const QModelIndex& index, int role) const; - void emitDataInstalled (bool success) { emit dataInstalled(success); }; // ResourceReaderSimpleModel needed emit function + void emitDataInstalled (bool success) { Q_EMIT dataInstalled(success); }; // ResourceReaderSimpleModel needed Q_EMIT function Q_SIGNALS: diff --git a/lima_pelf/resourceTool/ResourceReaderToolBoxModel.h b/lima_pelf/resourceTool/ResourceReaderToolBoxModel.h index 7186f0771..0fe834881 100644 --- a/lima_pelf/resourceTool/ResourceReaderToolBoxModel.h +++ b/lima_pelf/resourceTool/ResourceReaderToolBoxModel.h @@ -39,7 +39,7 @@ 
Q_OBJECT ResourceReaderToolBoxModel (QObject * parent = 0); virtual ~ResourceReaderToolBoxModel(); - void emitDataInstalled (bool success) { emit dataInstalled(success); }; // ResourceReaderSimpleModel needed emit function + void emitDataInstalled (bool success) { Q_EMIT dataInstalled(success); }; // ResourceReaderSimpleModel needed Q_EMIT function Q_SIGNALS: void dataInstalled (bool success); // ResourceReaderSimpleModel needed signal From b68125fc3a0b1bbb9d7373665ce781257cad3881 Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Tue, 26 Apr 2016 01:44:32 +0200 Subject: [PATCH 44/82] Some Qt keywords were remaining --- lima_annoqt/src/annoqt.cpp | 6 +-- lima_annoqt/src/kcolorcollection.cpp | 2 +- .../PosTagger/DynamicSvmToolPosTagger.cpp | 2 +- .../benchmarkingTool/BenchmarkingTool.cpp | 42 +++++++++---------- .../BenchmarkingXmlWriter.cpp | 8 ++-- .../benchmarkingTool/EvaluationResult.cpp | 4 +- lima_pelf/benchmarkingTool/Pipeline.cpp | 4 +- 7 files changed, 34 insertions(+), 34 deletions(-) diff --git a/lima_annoqt/src/annoqt.cpp b/lima_annoqt/src/annoqt.cpp index d537ead74..523cafe7c 100644 --- a/lima_annoqt/src/annoqt.cpp +++ b/lima_annoqt/src/annoqt.cpp @@ -405,7 +405,7 @@ bool Annoqt::saveFile( const QString &fileName ) // std::cerr<document()->toHtml("utf-8").toUtf8().data() << std::endl; - foreach (SpecificEntity* entity, m_entities) + Q_FOREACH (SpecificEntity* entity, m_entities) { QString string = entity->string(); QRegExp rxamp("&(?!amp;)"); @@ -463,7 +463,7 @@ Annoqt::~Annoqt() { qDebug() << "Annoqt::~Annoqt"; - foreach (SpecificEntity* se, m_entities) + Q_FOREACH (SpecificEntity* se, m_entities) { delete se; } @@ -801,7 +801,7 @@ void Annoqt::computeEntitiesMap() { qDebug() << "Annoqt::computeEntitiesMap"; m_entitiesMap.clear(); - foreach (SpecificEntity* entity, m_entities) + Q_FOREACH (SpecificEntity* entity, m_entities) { for (quint32 i = entity->position(); i < entity->position()+entity->length(); i++) { diff --git 
a/lima_annoqt/src/kcolorcollection.cpp b/lima_annoqt/src/kcolorcollection.cpp index ac9801f63..d690fdd10 100644 --- a/lima_annoqt/src/kcolorcollection.cpp +++ b/lima_annoqt/src/kcolorcollection.cpp @@ -152,7 +152,7 @@ KColorCollection::save() str << "KDE RGB Palette\n"; str << description << "\n"; - foreach (const KColorCollectionPrivate::ColorNode &node, d->colorList) + Q_FOREACH (const KColorCollectionPrivate::ColorNode &node, d->colorList) { int r,g,b; node.color.getRgb(&r, &g, &b); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/DynamicSvmToolPosTagger.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/DynamicSvmToolPosTagger.cpp index 820002935..8c923ed56 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/DynamicSvmToolPosTagger.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/DynamicSvmToolPosTagger.cpp @@ -46,7 +46,7 @@ #include #include #include -#include +#include #include // LDBL_MIN/MAX #include // log diff --git a/lima_pelf/benchmarkingTool/BenchmarkingTool.cpp b/lima_pelf/benchmarkingTool/BenchmarkingTool.cpp index 1c6f41239..d1baca566 100644 --- a/lima_pelf/benchmarkingTool/BenchmarkingTool.cpp +++ b/lima_pelf/benchmarkingTool/BenchmarkingTool.cpp @@ -128,7 +128,7 @@ void BenchmarkingTool::init () pipeGraphsSplitter->restoreState(settings->value ("pipeGraphsSplitter").toByteArray()); QStringList textFiles = settings->value ("textFiles").toStringList(); - foreach (QString textFile, textFiles) + Q_FOREACH (QString textFile, textFiles) { recentFilesList->addItem(textFile); } @@ -162,11 +162,11 @@ void BenchmarkingTool::updateErrorsWidget() QMultiMap errors = utterancesWithErrors(selectedUnitTextPath); // qDebug() << "BenchmarkingTool::updateErrorsWidget utterancesWithErrors:" << errors.size(); - foreach (const QString& key, errors.keys().toSet()) + Q_FOREACH (const QString& key, errors.keys().toSet()) { QList list = errors.values(key); qSort(list); 
- foreach(const QString& value, list) + Q_FOREACH(const QString& value, list) { // qDebug() << "add child item" << key << value; QTreeWidgetItem * childItem = new QTreeWidgetItem(); @@ -204,7 +204,7 @@ void BenchmarkingTool::slotErrorStatementActivated(QTreeWidgetItem* item, int co BenchmarkingResult* benchResult = (pipeline->results)[pipeline->startTime]; QMap& puResult = benchResult->resultUnits; qDebug() << "BenchmarkingTool::slotErrorStatementActivated " << puResult.size() << " pu results"; - foreach(PipelineUnit* pipelineUnit, puResult.keys()) + Q_FOREACH(PipelineUnit* pipelineUnit, puResult.keys()) { // qDebug() << "selectedUnitTextPath: " << selectedUnitTextPath << "; pipelineName: " << pipelineName; if (pipelineName != pipelineUnit->name) continue; @@ -227,7 +227,7 @@ void BenchmarkingTool::slotErrorStatementActivated(QTreeWidgetItem* item, int co QList previousErrors = getErrors(pipelineName, statementId, m_previousBenchmarkingResult); qDebug() << "previousErrors" << pipelineName << statementId << ":" << previousErrors; - foreach (const QStringList& list, errors) + Q_FOREACH (const QStringList& list, errors) { if (!previousErrors.contains(list)) { @@ -238,7 +238,7 @@ void BenchmarkingTool::slotErrorStatementActivated(QTreeWidgetItem* item, int co } } - foreach (const QStringList& list, previousErrors) + Q_FOREACH (const QStringList& list, previousErrors) { if (!errors.contains(list)) { @@ -328,7 +328,7 @@ void BenchmarkingTool::resetEvaluationCurves () // recallQwtPlot->clear(); evaluationResultTypeQwtCurves.clear(); QMap& dimensions = EvaluationResult::getDimensions(); - foreach(EvaluationResult::DIMENSION_ID dimensionId, dimensions.keys()) + Q_FOREACH(EvaluationResult::DIMENSION_ID dimensionId, dimensions.keys()) // dimensionsIt = dimensions.begin(); dimensionsIt != dimensions.end(); dimensionsIt++) { EvaluationResultDimension* dimension = dimensions[dimensionId]; @@ -382,7 +382,7 @@ void BenchmarkingTool::updateDimensionsWidgets () QCheckBox* 
dimensionCheckBox; QLabel* dimensionLabel; QMap& dimensions = EvaluationResult::getDimensions(); - foreach(EvaluationResult::DIMENSION_ID dimensionId, dimensions.keys()) + Q_FOREACH(EvaluationResult::DIMENSION_ID dimensionId, dimensions.keys()) { EvaluationResultDimension* dimension = dimensions[dimensionId]; dimensionCheckBox = new QCheckBox(this); @@ -459,7 +459,7 @@ void BenchmarkingTool::pipelineUnitsChanged () pipelineUnitDisplayCb->addItem("All pipeline units"); const QList& pipelineUnits = pipeline->getUnits(); int unitId = 0; - foreach(PipelineUnit* unit, pipelineUnits) + Q_FOREACH(PipelineUnit* unit, pipelineUnits) { pipelineUnitDisplayCb->addItem(unit->name, unitId); unitId++; @@ -477,17 +477,17 @@ void BenchmarkingTool::updateResultsViews() if(pipelineUnitDisplayCb->currentIndex() > 0) selectedUnitTextPath = pipeline->getUnits()[pipelineUnitDisplayCb->currentIndex() - 1]->textPath; ResultsModel::selectedUnitTextPath = selectedUnitTextPath; - foreach(EvaluationResultDimension* dimension, dimensions) + Q_FOREACH(EvaluationResultDimension* dimension, dimensions) { EvaluationResult::DIMENSION_ID dimensionId = (EvaluationResult::DIMENSION_ID)dimension->id; int noResults = 1; double xFmeasure[nbRes], yFmeasure[nbRes], xPrecision[nbRes], yPrecision[nbRes], xRecall[nbRes], yRecall[nbRes]; - foreach (BenchmarkingResult* result, pipeline->results) + Q_FOREACH (BenchmarkingResult* result, pipeline->results) { QMap& puResult = result->resultUnits; int nbPus = 0; double sumFc = 0, sumFp = 0, sumCr = 0; - foreach (PipelineUnit* pipelineUnit, puResult.keys()) + Q_FOREACH (PipelineUnit* pipelineUnit, puResult.keys()) { if(selectedUnitTextPath.isEmpty() || selectedUnitTextPath == pipelineUnit->textPath) { @@ -927,7 +927,7 @@ void BenchmarkingTool::slotTextFileActivated(QListWidgetItem* item) void BenchmarkingTool::slotRemoveTextFile() { QList items = recentFilesList->selectedItems(); - foreach(QListWidgetItem* item, items) + Q_FOREACH(QListWidgetItem* item, items) { 
recentFilesList->takeItem(recentFilesList->row(item)); } @@ -985,7 +985,7 @@ void BenchmarkingTool::compareWith(const QString& otherFilename) qDebug() << "BenchmarkingTool::compareWith"; QMultiMap utterancesSet = utterancesWithErrors(selectedUnitTextPath); QString utterances; - foreach (const QString& utt, utterancesSet.values(selectedUnitTextPath)) + Q_FOREACH (const QString& utt, utterancesSet.values(selectedUnitTextPath)) { utterances += utt + ","; } @@ -1043,7 +1043,7 @@ QMultiMap BenchmarkingTool::utterancesWithErrors(BenchmarkingRe // words if it is not the last one QMap puResult = benchmarkingResult->resultUnits; // qDebug() << "BenchmarkingTool::utterancesWithErrors " << puResult.size() << " pu results"; - foreach(PipelineUnit* pipelineUnit, puResult.keys()) + Q_FOREACH(PipelineUnit* pipelineUnit, puResult.keys()) { QString pipelineName = pipelineUnit->name; // qDebug() << "selectedUnitTextPath: " << selectedUnitTextPath << "; pipelineName: " << pipelineName; @@ -1065,7 +1065,7 @@ QMultiMap BenchmarkingTool::utterancesWithErrors(BenchmarkingRe keys.unite(QSet::fromList(fals.keys())); keys.unite(QSet::fromList(type.keys())); // qDebug() << "utterances insert" << pipelineName << keys; - foreach(const QString& key, keys) + Q_FOREACH(const QString& key, keys) { if (!utterances.values(pipelineName).contains(key)) utterances.insert(pipelineName,key); @@ -1123,10 +1123,10 @@ void BenchmarkingTool::updateErrorsWidget(BenchmarkingResult* benchmarkingResult // qDebug() << "BenchmarkingTool::updateErrorsWidget (compare) AFTER UTTERANCES WITH ERRORS"; QMap alreadyInserted; - foreach (const QString& key, errorUtterances.keys().toSet()) + Q_FOREACH (const QString& key, errorUtterances.keys().toSet()) { // qDebug() << "key" << key; - foreach(const QString& value, errorUtterances.values(key)) + Q_FOREACH(const QString& value, errorUtterances.values(key)) { // qDebug() << "value" << value; QList errors = getErrors(key, value, benchmarkingResult); @@ -1134,7 +1134,7 @@ void 
BenchmarkingTool::updateErrorsWidget(BenchmarkingResult* benchmarkingResult QList previousErrors = getErrors(key, value, previousBenchmarkingResult); qDebug() << "previousErrors" << key << value << ":" << previousErrors; - foreach (const QStringList& list, errors) + Q_FOREACH (const QStringList& list, errors) { if (!previousErrors.contains(list) && !(alreadyInserted.contains(key) && alreadyInserted.values(key).contains(value))) { @@ -1147,7 +1147,7 @@ void BenchmarkingTool::updateErrorsWidget(BenchmarkingResult* benchmarkingResult alreadyInserted[key] = value; } } - foreach (const QStringList& list, previousErrors) + Q_FOREACH (const QStringList& list, previousErrors) { if (!errors.contains(list) && !(alreadyInserted.contains(key) && alreadyInserted.values(key).contains(value))) { @@ -1173,7 +1173,7 @@ QList BenchmarkingTool::getErrors(const QString& unit, const QStrin QMap& puResult = benchResult->resultUnits; qDebug() << "BenchmarkingTool::getErrors" << puResult.size() << " pu results"; - foreach(PipelineUnit* pipelineUnit, puResult.keys()) + Q_FOREACH(PipelineUnit* pipelineUnit, puResult.keys()) { QString pipelineName = pipelineUnit->name; qDebug() << "unit: " << unit << "; pipelineName: " << pipelineName; diff --git a/lima_pelf/benchmarkingTool/BenchmarkingXmlWriter.cpp b/lima_pelf/benchmarkingTool/BenchmarkingXmlWriter.cpp index d0f2aaa75..10020bf08 100644 --- a/lima_pelf/benchmarkingTool/BenchmarkingXmlWriter.cpp +++ b/lima_pelf/benchmarkingTool/BenchmarkingXmlWriter.cpp @@ -45,7 +45,7 @@ bool BenchmarkingXmlWriter::write() writeStartElement("pipeline"); int unitId = 0; const QList& pipelineUnits = m_pipeline->getUnits(); - foreach (PipelineUnit* unit, pipelineUnits) + Q_FOREACH (PipelineUnit* unit, pipelineUnits) { writeStartElement("unit"); unit->fileId = ++unitId; @@ -59,20 +59,20 @@ bool BenchmarkingXmlWriter::write() writeEndElement(); writeStartElement("evaluations"); - foreach (BenchmarkingResult* benchmarkingResult, m_pipeline->results) + Q_FOREACH 
(BenchmarkingResult* benchmarkingResult, m_pipeline->results) { writeStartElement("pipelineEvaluation"); QDateTime evaluationTime = benchmarkingResult->time; writeAttribute("time", QString::number(evaluationTime.toTime_t())); writeAttribute("comments", benchmarkingResult->comment); - foreach (PipelineUnit* unit, pipelineUnits) + Q_FOREACH (PipelineUnit* unit, pipelineUnits) { if(benchmarkingResult->resultUnits.contains(unit)) { writeStartElement("unitEvaluation"); writeAttribute("unitId", QString::number(unit->fileId)); EvaluationResultSet* unitResults = benchmarkingResult->resultUnits[unit]; - foreach(EvaluationResult::DIMENSION_ID dimensionId, unitResults->keys()) + Q_FOREACH(EvaluationResult::DIMENSION_ID dimensionId, unitResults->keys()) { EvaluationResult* result = (*unitResults)[dimensionId]; writeStartElement("result"); diff --git a/lima_pelf/benchmarkingTool/EvaluationResult.cpp b/lima_pelf/benchmarkingTool/EvaluationResult.cpp index ab445ae91..63d11d00e 100644 --- a/lima_pelf/benchmarkingTool/EvaluationResult.cpp +++ b/lima_pelf/benchmarkingTool/EvaluationResult.cpp @@ -195,12 +195,12 @@ QList EvaluationResult::getDimensionsVisible () { QList dimensionsVisible; bool masked = false; - foreach(EvaluationResultDimension* dimension, dimensions) + Q_FOREACH(EvaluationResultDimension* dimension, dimensions) { if(dimension->visibilityState == Qt::PartiallyChecked) masked = true; } - foreach(EvaluationResultDimension* dimension, dimensions) + Q_FOREACH(EvaluationResultDimension* dimension, dimensions) { int dimensionGroupId = getDimensionGroup((DIMENSION_ID)dimension->id); bool groupChecked = dimensionGroupId == -1 || dimensions[(DIMENSION_ID)dimensionGroupId]->visibilityState == Qt::Checked; diff --git a/lima_pelf/benchmarkingTool/Pipeline.cpp b/lima_pelf/benchmarkingTool/Pipeline.cpp index eb7061237..a3b7e040f 100644 --- a/lima_pelf/benchmarkingTool/Pipeline.cpp +++ b/lima_pelf/benchmarkingTool/Pipeline.cpp @@ -133,7 +133,7 @@ void Pipeline::moveUnits 
(QModelIndexList sourceIndexes, QModelIndex targetIndex if(sourceIndexRow < targetIndexRow) shiftTargetIndex = shiftSourceIndexes; } - foreach(PipelineUnit* movedUnit, movedUnits) + Q_FOREACH(PipelineUnit* movedUnit, movedUnits) { units.insert(units.begin() + targetIndex.row() + shiftTargetIndex, movedUnit); shiftTargetIndex++; @@ -325,7 +325,7 @@ void Pipeline::continueBenchmarking () void Pipeline::unitsUpdate () { QMap::iterator resultsIt; - foreach (BenchmarkingResult* result, results) + Q_FOREACH (BenchmarkingResult* result, results) { QMap newResultUnits; QList::iterator unitsIt = units.begin(); From 2dadccf90d4539843623099ef4f2c364be5e8396 Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Tue, 26 Apr 2016 01:54:56 +0200 Subject: [PATCH 45/82] Make Python libraries optional --- lima_linguisticprocessing/CMakeLists.txt | 8 +++--- .../core/SemanticAnalysis/CMakeLists.txt | 12 +++++++-- lima_linguisticprocessing/test/CMakeLists.txt | 26 +++++++++---------- 3 files changed, 28 insertions(+), 18 deletions(-) diff --git a/lima_linguisticprocessing/CMakeLists.txt b/lima_linguisticprocessing/CMakeLists.txt index b61fb2c20..22aefa97b 100644 --- a/lima_linguisticprocessing/CMakeLists.txt +++ b/lima_linguisticprocessing/CMakeLists.txt @@ -124,9 +124,11 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") DESTINATION ${LIB_INSTALL_DIR}) endif () -find_package(PythonLibs 3.4 REQUIRED) -include_directories(${PYTHON_INCLUDE_DIRS}) -link_directories(${PYTHON_LIBRARIES}) +find_package(PythonLibs 3.4) +if (${PYTHONLIBS_FOUND}) + include_directories(${PYTHON_INCLUDE_DIRS}) + link_directories(${PYTHON_LIBRARIES}) +endif() #add_definitions( -DBOOST_ALL_NO_LIB ) add_definitions( -DBOOST_ALL_DYN_LINK ) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/CMakeLists.txt b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/CMakeLists.txt index 3c169c243..1c176edb5 100644 --- 
a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/CMakeLists.txt +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/CMakeLists.txt @@ -22,7 +22,6 @@ add_definitions(-DLIMA_SEMANTICANALYSIS_EXPORTING) SET(lima-lp-semanticanalysis_LIB_SRCS ConstraintFunction.cpp - KnowledgeBasedSemanticRoleLabeler.cpp SemanticRelationAnnotation.cpp SemanticRelationData.cpp SemanticRoleLabelingLoader.cpp @@ -30,7 +29,16 @@ SET(lima-lp-semanticanalysis_LIB_SRCS SemanticRelationsXmlLogger.cpp ) -add_library(lima-lp-semanticanalysis SHARED ${lima-lp-semanticanalysis_LIB_SRCS}) +if (${PYTHONLIBS_FOUND}) + SET(lima-lp-semanticanalysis_LIB_SRCS + KnowledgeBasedSemanticRoleLabeler.cpp + ${lima-lp-semanticanalysis_LIB_SRCS} + ) +endif() +DECLARE_LIMA_PLUGIN(lima-lp-semanticanalysis) + +#add_library(lima-lp-semanticanalysis SHARED ${lima-lp-semanticanalysis_LIB_SRCS}) + target_link_libraries(lima-lp-semanticanalysis lima-common-factory diff --git a/lima_linguisticprocessing/test/CMakeLists.txt b/lima_linguisticprocessing/test/CMakeLists.txt index 7f1ab9fbd..32e11f6dc 100644 --- a/lima_linguisticprocessing/test/CMakeLists.txt +++ b/lima_linguisticprocessing/test/CMakeLists.txt @@ -63,19 +63,19 @@ target_link_libraries(analyzeText install(TARGETS analyzeText DESTINATION bin) ########### next target ############### - -SET(srl_SRCS - srl.cpp -) - -add_executable(srl ${srl_SRCS}) -target_link_libraries(srl - ${PYTHON_LIBRARY} - ${QT_LIBRARIES} -) - -install(TARGETS srl DESTINATION bin) - +if (${PYTHONLIBS_FOUND}) + SET(srl_SRCS + srl.cpp + ) + + add_executable(srl ${srl_SRCS}) + target_link_libraries(srl + ${PYTHON_LIBRARY} + ${QT_LIBRARIES} + ) + + install(TARGETS srl DESTINATION bin) +endif() ########### next target ############### # SET(threadedAnalyzeText_SRCS From 2b053c288da6a56c2e950b2efcab20201e9bb1a1 Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Tue, 26 Apr 2016 02:02:00 +0200 Subject: [PATCH 46/82] Missing cmake if for python 
--- .../core/SemanticAnalysis/CMakeLists.txt | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/CMakeLists.txt b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/CMakeLists.txt index 1c176edb5..49d62032f 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/CMakeLists.txt +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/CMakeLists.txt @@ -62,9 +62,14 @@ target_link_libraries(lima-lp-semanticanalysis ${optionalLibs} ${Boost_LIBRARIES} ${QT_LIBRARIES} - ${PYTHON_LIBRARY} ) +if (${PYTHONLIBS_FOUND}) + target_link_libraries(lima-lp-semanticanalysis + ${PYTHON_LIBRARY} + ) +endif() + set_target_properties(lima-lp-semanticanalysis PROPERTIES VERSION ${LIMA_LP_LIB_VERSION} SOVERSION ${LIMA_LP_LIB_SOVERSION}) install(TARGETS lima-lp-semanticanalysis DESTINATION lib) From 9631877ee1ea2022c0e5899e8d15a66672e9df74 Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Tue, 26 Apr 2016 12:23:43 +0200 Subject: [PATCH 47/82] Break circular dependency under Windows Move ConllDumper to semantic analysis --- .../linguisticProcessing/core/AnalysisDumpers/CMakeLists.txt | 1 - .../linguisticProcessing/core/SemanticAnalysis/CMakeLists.txt | 3 ++- .../{AnalysisDumpers => SemanticAnalysis}/ConllDumper.cpp | 0 .../core/{AnalysisDumpers => SemanticAnalysis}/ConllDumper.h | 4 ++-- .../SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) rename lima_linguisticprocessing/src/linguisticProcessing/core/{AnalysisDumpers => SemanticAnalysis}/ConllDumper.cpp (100%) rename lima_linguisticprocessing/src/linguisticProcessing/core/{AnalysisDumpers => SemanticAnalysis}/ConllDumper.h (92%) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/CMakeLists.txt 
b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/CMakeLists.txt index f1967236a..10d787e05 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/CMakeLists.txt +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/CMakeLists.txt @@ -31,7 +31,6 @@ SET(lima-lp-analysisdumpers_LIB_SRCS NullDumper.cpp StopList.cpp TextDumper.cpp - ConllDumper.cpp fullXmlDumper.cpp linearTextRepresentationDumper.cpp linearTextRepresentationLogger.cpp diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/CMakeLists.txt b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/CMakeLists.txt index 49d62032f..3f19bb33a 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/CMakeLists.txt +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/CMakeLists.txt @@ -27,6 +27,7 @@ SET(lima-lp-semanticanalysis_LIB_SRCS SemanticRoleLabelingLoader.cpp LimaConllTokenIdMapping.cpp SemanticRelationsXmlLogger.cpp + ConllDumper.cpp ) if (${PYTHONLIBS_FOUND}) @@ -66,7 +67,7 @@ target_link_libraries(lima-lp-semanticanalysis if (${PYTHONLIBS_FOUND}) target_link_libraries(lima-lp-semanticanalysis - ${PYTHON_LIBRARY} + ${PYTHON_LIBRARY} ) endif() diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/ConllDumper.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/ConllDumper.cpp similarity index 100% rename from lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/ConllDumper.cpp rename to lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/ConllDumper.cpp diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/ConllDumper.h b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/ConllDumper.h similarity index 92% rename from 
lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/ConllDumper.h rename to lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/ConllDumper.h index 86ec375dd..6ca27be3b 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/ConllDumper.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/ConllDumper.h @@ -19,7 +19,7 @@ #ifndef LIMA_LINGUISTICPROCESSING_ANALYSISDUMPERSTEXTDUMPER_H #define LIMA_LINGUISTICPROCESSING_ANALYSISDUMPERSTEXTDUMPER_H -#include "AnalysisDumpersExport.h" +#include "SemanticAnalysisExport.h" #include "linguisticProcessing/core/LinguisticProcessors/AbstractTextualAnalysisDumper.h" namespace Lima @@ -36,7 +36,7 @@ class ConllDumperPrivate; /** @author Gael de Chalendar */ -class LIMA_ANALYSISDUMPERS_EXPORT ConllDumper : public AbstractTextualAnalysisDumper +class LIMA_SEMANTICANALYSIS_EXPORT ConllDumper : public AbstractTextualAnalysisDumper { public: ConllDumper(); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp index 7faaef03d..c515050d1 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp @@ -23,7 +23,7 @@ #include "common/misc/Exceptions.h" #include "common/Data/strwstrtools.h" #include "common/AbstractFactoryPattern/SimpleFactory.h" -#include "linguisticProcessing/core/AnalysisDumpers/ConllDumper.h" +#include "linguisticProcessing/core/SemanticAnalysis/ConllDumper.h" #include "linguisticProcessing/core/LinguisticResources/LinguisticResources.h" #include "linguisticProcessing/core/LinguisticProcessors/LimaStringText.h" #include 
"linguisticProcessing/core/LinguisticProcessors/LinguisticMetaData.h" From b87098185ea87eda793b0fade32c50309bf4ab29 Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Thu, 28 Apr 2016 09:58:41 +0200 Subject: [PATCH 48/82] Don't add predicates for should not be kept tokens --- .../core/AnalysisDumpers/BowGeneration.cpp | 66 +++++++++++-------- .../core/AnalysisDumpers/BowGeneration.h | 2 +- 2 files changed, 39 insertions(+), 29 deletions(-) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowGeneration.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowGeneration.cpp index 1e37716f8..b978962ea 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowGeneration.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowGeneration.cpp @@ -589,25 +589,20 @@ std::vector< std::pair< boost::shared_ptr< BoWRelation >, boost::shared_ptr< Abs #endif // note: anaVertices size should be 0 or 1 - //for (std::set< uint64_t >::const_iterator anaVerticesIt = anaVertices.begin(); portage 32 64 - for (std::set< AnnotationGraphVertex >::const_iterator anaVerticesIt = anaVertices.begin(); - anaVerticesIt != anaVertices.end(); anaVerticesIt++) + for ( AnnotationGraphVertex anaVertex : anaVertices) { - #ifdef DEBUG_LP - LDEBUG << "BowGenerator::createAbstractBoWElement Looking at analysis graph vertex " << *anaVerticesIt; +#ifdef DEBUG_LP + LDEBUG << "BowGenerator::createAbstractBoWElement Looking at analysis graph vertex " << anaVertex; #endif - //std::set< uint64_t > matches = annotationData->matches("AnalysisGraph",*anaVerticesIt,"annot"); portage 32 64 - std::set< AnnotationGraphVertex > matches = annotationData->matches("AnalysisGraph",*anaVerticesIt,"annot"); - //for (std::set< uint64_t >::const_iterator it = matches.begin(); portage 32 64 - for (std::set< AnnotationGraphVertex >::const_iterator it = matches.begin(); - it != matches.end(); it++) + 
std::set< AnnotationGraphVertex > matches = annotationData->matches("AnalysisGraph",anaVertex,"annot"); + for (AnnotationGraphVertex matchVertex: matches) { #ifdef DEBUG_LP - LDEBUG << "BowGenerator::createAbstractBoWElement Looking at annotation graph vertex " << *it; + LDEBUG << "BowGenerator::createAbstractBoWElement Looking at annotation graph vertex " << matchVertex; #endif - if (annotationData->hasAnnotation(*it, Common::Misc::utf8stdstring2limastring("SpecificEntity"))) + if (annotationData->hasAnnotation(matchVertex, Common::Misc::utf8stdstring2limastring("SpecificEntity"))) { - boost::shared_ptr< BoWToken > se = createSpecificEntity(v,*it, annotationData, anagraph, posgraph, offsetBegin, false); + boost::shared_ptr< BoWToken > se = createSpecificEntity(v,matchVertex, annotationData, anagraph, posgraph, offsetBegin, false); if (se != 0) { #ifdef DEBUG_LP @@ -631,16 +626,14 @@ std::vector< std::pair< boost::shared_ptr< BoWRelation >, boost::shared_ptr< Abs #ifdef DEBUG_LP LDEBUG << "BowGenerator::createAbstractBoWElement there are " << matches.size() << " annotation graph vertices matching the current PsGraph vertex " << v; #endif - for (std::set< AnnotationGraphVertex >::const_iterator it = matches.begin(); - it != matches.end(); it++) + for (AnnotationGraphVertex vx: matches) { - AnnotationGraphVertex vx=*it; #ifdef DEBUG_LP LDEBUG << "BowGenerator::createAbstractBoWElement Looking at annotation graph vertex " << vx; #endif - if (annotationData->hasAnnotation(*it, Common::Misc::utf8stdstring2limastring("SpecificEntity"))) + if (annotationData->hasAnnotation(vx, Common::Misc::utf8stdstring2limastring("SpecificEntity"))) { - boost::shared_ptr< BoWToken > se = createSpecificEntity(v,*it, annotationData, anagraph, posgraph, offsetBegin); + boost::shared_ptr< BoWToken > se = createSpecificEntity(v,vx, annotationData, anagraph, posgraph, offsetBegin); if (se != 0) { #ifdef DEBUG_LP @@ -652,9 +645,9 @@ std::vector< std::pair< boost::shared_ptr< BoWRelation >, 
boost::shared_ptr< Abs return abstractBowEl; } } - else if (annotationData->hasIntAnnotation(*it, Common::Misc::utf8stdstring2limastring("CpdTense"))) + else if (annotationData->hasIntAnnotation(vx, Common::Misc::utf8stdstring2limastring("CpdTense"))) { - boost::shared_ptr< BoWToken > ct = createCompoundTense(*it, annotationData, anagraph, posgraph, offsetBegin, visited); + boost::shared_ptr< BoWToken > ct = createCompoundTense(vx, annotationData, anagraph, posgraph, offsetBegin, visited); if (ct != 0) { #ifdef DEBUG_LP @@ -666,20 +659,38 @@ std::vector< std::pair< boost::shared_ptr< BoWRelation >, boost::shared_ptr< Abs return abstractBowEl; } } - else if (annotationData->hasStringAnnotation(*it, Common::Misc::utf8stdstring2limastring("Predicate"))) + else if (annotationData->hasStringAnnotation(vx, Common::Misc::utf8stdstring2limastring("Predicate"))) { #ifdef DEBUG_LP LDEBUG << "BowGenerator::createAbstractBoWElement Found a predicate in the PosGraph annnotation graph matching"; #endif - boost::shared_ptr< BoWPredicate > bP=createPredicate(v, *it, annotationData, anagraph, posgraph, offsetBegin, visited, keepAnyway); - if (bP!=0){ + + MorphoSyntacticData* data = get(vertex_data, posgraph, v); + bool toKeep = true; + if (data!=0) + { + for (const auto& elem: *data) + { + if (!keepAnyway && !shouldBeKept(elem)) + { + toKeep = false; + break; + } + } + } + if (toKeep) + { + boost::shared_ptr< BoWPredicate > bP=createPredicate(v, vx, annotationData, anagraph, posgraph, offsetBegin, visited, keepAnyway); + if (bP!=0) + { #ifdef DEBUG_LP - LDEBUG << "BowGenerator::createAbstractBoWElement created a predicate" ; + LDEBUG << "BowGenerator::createAbstractBoWElement created a predicate" ; #endif - abstractBowEl.push_back(std::make_pair(boost::shared_ptr< BoWRelation >(),bP)); - // visited.insert(v); - return abstractBowEl; + abstractBowEl.push_back(std::make_pair(boost::shared_ptr< BoWRelation >(),bP)); + // visited.insert(v); + return abstractBowEl; + } } } else @@ 
-690,7 +701,6 @@ std::vector< std::pair< boost::shared_ptr< BoWRelation >, boost::shared_ptr< Abs } } - // bow tokens have been created for specific entities on the before PoS // tagging graph. return them if (!abstractBowEl.empty()) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowGeneration.h b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowGeneration.h index 0922fafd6..87e0da543 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowGeneration.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowGeneration.h @@ -123,7 +123,7 @@ class LIMA_ANALYSISDUMPERS_EXPORT BowGenerator bool keepAnyway = false) const; /** - * Builds a BoWPredicate corresoonding to a semantic relation (an edge in the + * Builds a BoWPredicate corresponding to a semantic relation (an edge in the * annotation graph holding a SemanticRelation annotation * * @param lgvs source linguistic graph vertex From caeecec3d01c36bcdee06ef5eff3f2f0c48d84eb Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Thu, 28 Apr 2016 13:43:56 +0200 Subject: [PATCH 49/82] Add FrameNet Modex --- .../SRLIntegration/CMakeLists.txt | 49 ++++++++++++++++++- .../SRLIntegration/FrameNet-eng.rules | 5 ++ .../SRLIntegration/FrameNet-fre.rules | 5 ++ .../SRLIntegration/FraneNet-modex.xml | 31 ++++++++++++ 4 files changed, 89 insertions(+), 1 deletion(-) create mode 100644 lima_linguisticdata/SRLIntegration/FrameNet-eng.rules create mode 100644 lima_linguisticdata/SRLIntegration/FrameNet-fre.rules create mode 100644 lima_linguisticdata/SRLIntegration/FraneNet-modex.xml diff --git a/lima_linguisticdata/SRLIntegration/CMakeLists.txt b/lima_linguisticdata/SRLIntegration/CMakeLists.txt index c4e94b3fd..60bc982b6 100644 --- a/lima_linguisticdata/SRLIntegration/CMakeLists.txt +++ b/lima_linguisticdata/SRLIntegration/CMakeLists.txt @@ -32,6 +32,17 @@ foreach(LANG ${LIMA_LANGUAGES}) COMMENT "create config 
env for srl rules (VerbNet-modex.xml)" VERBATIM ) + add_custom_command( + OUTPUT ${CMAKE_BINARY_DIR}/execEnv/config/FrameNet-modex.xml + COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_BINARY_DIR}/execEnv/config + COMMAND ${CMAKE_COMMAND} -E copy + ${CMAKE_SOURCE_DIR}/SRLIntegration/FrameNet-modex.xml + ${CMAKE_BINARY_DIR}/execEnv/config/FrameNet-modex.xml + DEPENDS + ${CMAKE_SOURCE_DIR}/SRLIntegration/FrameNet-modex.xml + COMMENT "create config env for srl rules (FrameNet-modex.xml)" + VERBATIM + ) add_custom_command( OUTPUT ${CMAKE_BINARY_DIR}/execEnv/config/lima-common-${LANG}.xml COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_BINARY_DIR}/execEnv/config @@ -88,6 +99,15 @@ foreach(LANG ${LIMA_LANGUAGES}) DEPENDS ${CMAKE_BINARY_DIR}/execEnv/config/lima-analysis.xml DEPENDS ${CMAKE_BINARY_DIR}/execEnv/config/lima-common.xml ) + add_custom_target( + rules-${LANG}-FrameNet-configEnv + ALL + DEPENDS ${CMAKE_BINARY_DIR}/execEnv/config/FrameNet-modex.xml + DEPENDS ${CMAKE_BINARY_DIR}/execEnv/config/lima-common-${LANG}.xml + DEPENDS ${CMAKE_BINARY_DIR}/execEnv/config/lima-lp-${LANG}.xml + DEPENDS ${CMAKE_BINARY_DIR}/execEnv/config/lima-analysis.xml + DEPENDS ${CMAKE_BINARY_DIR}/execEnv/config/lima-common.xml + ) add_custom_command( OUTPUT ${CMAKE_BINARY_DIR}/execEnv/resources/LinguisticProcessings/${LANG}/code-${LANG}.xml @@ -137,7 +157,34 @@ foreach(LANG ${LIMA_LANGUAGES}) endif () + + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/FrameNet-${LANG}.rules) + + add_custom_command( + OUTPUT FrameNet-${LANG}.bin + COMMAND compile-rules --resourcesDir=${LIMA_RESOURCES} --configDir=${LIMA_CONF} --language=${LANG} -oFrameNet-${LANG}.bin ${_current} --modex=FrameNet-modex.xml ${CMAKE_CURRENT_SOURCE_DIR}/FrameNet-${LANG}.rules + DEPENDS ${_current} ${DEPENDENCIES} +# WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + VERBATIM + ) + + add_custom_target( + rules-FrameNet-${LANG}-main + ALL + DEPENDS FrameNet-${LANG}.bin + ) + + # add the link between the current target and its 
execution environment dependencies + add_dependencies(rules-FrameNet-${LANG}-main rules-${LANG}-FrameNet-configEnv-main rules-${LANG}-execEnv) + + add_dependencies(rules-FrameNet-${LANG}-main rules-${LANG}-FrameNet-configEnv srl-rules-${LANG}-execEnv) + + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/FrameNet-${LANG}.bin COMPONENT ${LANG} DESTINATION share/apps/lima/resources/SRLIntegration) + + + endif () + endforeach(LANG ${LIMA_LANGUAGES}) -install(FILES VerbNet-modex.xml COMPONENT common DESTINATION share/config/lima) +install(FILES FrameNet-modex.xml VerbNet-modex.xml COMPONENT common DESTINATION share/config/lima) diff --git a/lima_linguisticdata/SRLIntegration/FrameNet-eng.rules b/lima_linguisticdata/SRLIntegration/FrameNet-eng.rules new file mode 100644 index 000000000..04ea2862d --- /dev/null +++ b/lima_linguisticdata/SRLIntegration/FrameNet-eng.rules @@ -0,0 +1,5 @@ +set encoding=utf8 +using modex FameNet-modex.xml +using groups FameNet +set defaultAction=>CreateSpecificEntity() + diff --git a/lima_linguisticdata/SRLIntegration/FrameNet-fre.rules b/lima_linguisticdata/SRLIntegration/FrameNet-fre.rules new file mode 100644 index 000000000..04ea2862d --- /dev/null +++ b/lima_linguisticdata/SRLIntegration/FrameNet-fre.rules @@ -0,0 +1,5 @@ +set encoding=utf8 +using modex FameNet-modex.xml +using groups FameNet +set defaultAction=>CreateSpecificEntity() + diff --git a/lima_linguisticdata/SRLIntegration/FraneNet-modex.xml b/lima_linguisticdata/SRLIntegration/FraneNet-modex.xml new file mode 100644 index 000000000..1a5a6c727 --- /dev/null +++ b/lima_linguisticdata/SRLIntegration/FraneNet-modex.xml @@ -0,0 +1,31 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From 51182d7313bbaecce3c1dcaa2d5171d85355fe4e Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Thu, 28 Apr 2016 13:44:25 +0200 Subject: [PATCH 50/82] Correct very old memory corruption bug Detected thanks to the new sanitization gcc switches --- .../xmlConfigurationFileExceptions.h | 12 
------------ 1 file changed, 12 deletions(-) diff --git a/lima_common/src/common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h b/lima_common/src/common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h index d62271d37..7f3a705cb 100644 --- a/lima_common/src/common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h +++ b/lima_common/src/common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h @@ -63,8 +63,6 @@ namespace Lima { NoSuchModule ( const std::string &name ) : XMLConfigurationFileException ( "No such module " + name ),moduleName ( name ) {/*std::cout << "No such module " << name << std::endl;*/} //! @brief destructor (throw the exception) virtual ~NoSuchModule() throw() {} - //! @brief return the message error - const std::string what() {return ( "No such module " + moduleName );} private: NoSuchModule& operator=(const NoSuchModule&) {return *this;} //! @brief the name of the module that was not found @@ -79,8 +77,6 @@ namespace Lima { //! @param name the group that was not found NoSuchGroup ( const std::string &name ) : XMLConfigurationFileException ( "No such group " + name ),groupName ( name ) {/*std::cout << "No such group " << name << std::endl;*/} virtual ~NoSuchGroup() throw() {} - //! @brief return the message error - const std::string what() {return ( "No such group " + groupName );} private: NoSuchGroup& operator=(const NoSuchGroup&) {return *this;} //! @brief the name of the group that was not found @@ -95,8 +91,6 @@ namespace Lima { //! @param name the attribute that was not found NoSuchAttribute ( const std::string &name ) : XMLConfigurationFileException ( "No such attribute " + name ),attName ( name ) {/*std::cout << "No such attribute " << name << std::endl;*/} virtual ~NoSuchAttribute() throw() {} - //! @brief return the message error - const std::string what() {return ( "No such attribute " + attName );} private: NoSuchAttribute& operator=(const NoSuchAttribute&) {return *this;} //! 
@brief the name of the attribute that was not found @@ -111,8 +105,6 @@ class LIMA_XMLCONFIGURATIONFILES_EXPORT NoSuchParam : public XMLConfigurationFil //! @param name the param that was not found NoSuchParam ( const std::string &name ) : XMLConfigurationFileException ( "No such param '" + name + "'" ),paramName ( name ) {/*std::cout << "No such param " << name << std::endl;*/} virtual ~NoSuchParam() throw() {} - //! @brief return the message error - const std::string what() {return ( "No such param '" + paramName + "'" );} private: NoSuchParam& operator=(const NoSuchParam&) {return *this;} //! @brief the name of the param that was not found @@ -127,8 +119,6 @@ class LIMA_XMLCONFIGURATIONFILES_EXPORT NoSuchList : public XMLConfigurationFile //! @param name the list that was not found NoSuchList ( const std::string &name ) : XMLConfigurationFileException ( "No such list " + name ),listName ( name ) {/*std::cout << "No such list " << name << std::endl;*/} virtual ~NoSuchList() throw() {} - //! @brief return the message error - const std::string what() {return ( "No such list " + listName );} private: NoSuchList& operator=(const NoSuchList&) {return *this;} //! @brief the name of the list that was not found @@ -143,8 +133,6 @@ class LIMA_XMLCONFIGURATIONFILES_EXPORT NoSuchMap : public XMLConfigurationFileE //! @param name the map that was not found NoSuchMap ( const std::string &name ) : XMLConfigurationFileException ( "No such map " + name ),mapName ( name ) {/*std::cout << "No such map " << name << std::endl;*/} virtual ~NoSuchMap() throw() {} - //! @brief return the message error - const std::string what() {return ( "No such map " + mapName );} private: NoSuchMap& operator=(const NoSuchMap&) {return *this;} //! 
@brief the name of the map that was not found From 95af94c4fcfda5bf462a31bf6a0b98581874e25e Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Thu, 28 Apr 2016 23:23:33 +0200 Subject: [PATCH 51/82] Complete the FrameNet Modex --- .../SRLIntegration/FrameNet-eng.rules | 4 +- .../SRLIntegration/FrameNet-fre.rules | 5 +- .../SRLIntegration/FrameNet-modex.xml | 2231 +++++++++++++++++ .../SRLIntegration/FraneNet-modex.xml | 31 - 4 files changed, 2236 insertions(+), 35 deletions(-) create mode 100644 lima_linguisticdata/SRLIntegration/FrameNet-modex.xml delete mode 100644 lima_linguisticdata/SRLIntegration/FraneNet-modex.xml diff --git a/lima_linguisticdata/SRLIntegration/FrameNet-eng.rules b/lima_linguisticdata/SRLIntegration/FrameNet-eng.rules index 04ea2862d..4a7423057 100644 --- a/lima_linguisticdata/SRLIntegration/FrameNet-eng.rules +++ b/lima_linguisticdata/SRLIntegration/FrameNet-eng.rules @@ -1,5 +1,5 @@ set encoding=utf8 -using modex FameNet-modex.xml -using groups FameNet +using modex FrameNet-modex.xml +using groups FrameNet set defaultAction=>CreateSpecificEntity() diff --git a/lima_linguisticdata/SRLIntegration/FrameNet-fre.rules b/lima_linguisticdata/SRLIntegration/FrameNet-fre.rules index 04ea2862d..bc6c62c66 100644 --- a/lima_linguisticdata/SRLIntegration/FrameNet-fre.rules +++ b/lima_linguisticdata/SRLIntegration/FrameNet-fre.rules @@ -1,5 +1,6 @@ set encoding=utf8 -using modex FameNet-modex.xml -using groups FameNet +using modex FrameNet-modex.xml +using groups FrameNet set defaultAction=>CreateSpecificEntity() + diff --git a/lima_linguisticdata/SRLIntegration/FrameNet-modex.xml b/lima_linguisticdata/SRLIntegration/FrameNet-modex.xml new file mode 100644 index 000000000..418688b1e --- /dev/null +++ b/lima_linguisticdata/SRLIntegration/FrameNet-modex.xml @@ -0,0 +1,2231 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/lima_linguisticdata/SRLIntegration/FraneNet-modex.xml b/lima_linguisticdata/SRLIntegration/FraneNet-modex.xml deleted file mode 100644 index 1a5a6c727..000000000 --- a/lima_linguisticdata/SRLIntegration/FraneNet-modex.xml +++ /dev/null @@ -1,31 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - From d6015ae82c5e9f6e48820f8b4381ed4be380dbad Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Thu, 28 Apr 2016 23:24:27 +0200 Subject: [PATCH 52/82] Add information in exception messages --- lima_common/src/common/MediaticData/mediaticData.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/lima_common/src/common/MediaticData/mediaticData.cpp b/lima_common/src/common/MediaticData/mediaticData.cpp index 1880c53ae..614fbe11c 100644 --- a/lima_common/src/common/MediaticData/mediaticData.cpp +++ b/lima_common/src/common/MediaticData/mediaticData.cpp @@ -768,15 +768,14 @@ EntityType MediaticData::getEntityType(const EntityGroupId groupId, MDATALOGINIT; LERROR << "MediaticData::getEntityType unknown entity group id " << groupId <<"accessing" << entityName; - throw LimaException(); + throw LimaException("MediaticData::getEntityType unknown entity group id"); } try { return EntityType(m_d->m_entityTypes[groupId]->get(entityName),groupId); } - catch(LimaException& ) { + catch(LimaException& e) { MDATALOGINIT; - LWARN << "Unknown entity type " - << entityName; + LWARN << "Unknown entity type " << entityName << "in group id:"< Date: Thu, 28 Apr 2016 23:25:16 +0200 Subject: [PATCH 53/82] Add information in exception messages --- lima_common/src/common/misc/DoubleAccessObjectToIdMap.tcc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) 
diff --git a/lima_common/src/common/misc/DoubleAccessObjectToIdMap.tcc b/lima_common/src/common/misc/DoubleAccessObjectToIdMap.tcc index ac32e0047..9eedf40d1 100644 --- a/lima_common/src/common/misc/DoubleAccessObjectToIdMap.tcc +++ b/lima_common/src/common/misc/DoubleAccessObjectToIdMap.tcc @@ -58,7 +58,7 @@ get(const Object& val) const { typename DoubleAccessObjectToIdMap::AccessMap::const_iterator it=m_accessMap.find(&val); if (it==m_accessMap.end()) { - throw LimaException(); + throw LimaException("DoubleAccessObjectToIdMap::get(val) parameter not in map."); } else { return (*it).second; @@ -71,7 +71,7 @@ get(const Id& id) const { size_t i=(size_t) id; if (i >= m_reverseAccessMap.size()) { - throw LimaException(); + throw LimaException("DoubleAccessObjectToIdMap::get(id) parameter not in reverse map."); } else { return *(m_reverseAccessMap[i]); @@ -115,7 +115,7 @@ operator[](const Object& val) return (*inserted).second; } else { - throw LimaException(); + throw LimaException("DoubleAccessObjectToIdMap::operator[](val) parameter not in map"); } } else { From d6f3b6deb6263c16a3bb732d5e52f01ab3f98135 Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Thu, 28 Apr 2016 23:26:12 +0200 Subject: [PATCH 54/82] Include all predicate hypotheses in output Before one hypothesis was randomly chosen.
--- .../core/AnalysisDumpers/BowGeneration.cpp | 181 ++++++++++-------- .../core/AnalysisDumpers/BowGeneration.h | 2 +- 2 files changed, 97 insertions(+), 86 deletions(-) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowGeneration.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowGeneration.cpp index b978962ea..4614fb66e 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowGeneration.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowGeneration.cpp @@ -681,15 +681,17 @@ std::vector< std::pair< boost::shared_ptr< BoWRelation >, boost::shared_ptr< Abs } if (toKeep) { - boost::shared_ptr< BoWPredicate > bP=createPredicate(v, vx, annotationData, anagraph, posgraph, offsetBegin, visited, keepAnyway); - if (bP!=0) + for (boost::shared_ptr< BoWPredicate >& bP: createPredicate(v, vx, annotationData, anagraph, posgraph, offsetBegin, visited, keepAnyway)) { -#ifdef DEBUG_LP - LDEBUG << "BowGenerator::createAbstractBoWElement created a predicate" ; -#endif - abstractBowEl.push_back(std::make_pair(boost::shared_ptr< BoWRelation >(),bP)); - // visited.insert(v); - return abstractBowEl; + if (bP!=0) + { + #ifdef DEBUG_LP + LDEBUG << "BowGenerator::createAbstractBoWElement created a predicate" ; + #endif + abstractBowEl.push_back(std::make_pair(boost::shared_ptr< BoWRelation >(),bP)); + // visited.insert(v); + // return abstractBowEl; + } } } } @@ -705,7 +707,7 @@ std::vector< std::pair< boost::shared_ptr< BoWRelation >, boost::shared_ptr< Abs // tagging graph. 
return them if (!abstractBowEl.empty()) { - return abstractBowEl; +// return abstractBowEl; } const FsaStringsPool& sp=Common::MediaticData::MediaticData::single().stringsPool(m_language); @@ -1126,18 +1128,16 @@ boost::shared_ptr< BoWNamedEntity > BowGenerator::createSpecificEntity( } -boost::shared_ptr< BoWPredicate > BowGenerator::createPredicate( +QList< boost::shared_ptr< BoWPredicate > > BowGenerator::createPredicate( const LinguisticGraphVertex& lgv, const AnnotationGraphVertex& agv, const AnnotationData* annotationData, const LinguisticGraph& anagraph, const LinguisticGraph& posgraph, const uint64_t offset, std::set< LinguisticGraphVertex >& visited, bool keepAnyway) const { DUMPERLOGINIT; #ifdef DEBUG_LP LDEBUG << "BowGenerator::createPredicate ling:" << lgv << "; annot:" << agv; #endif - boost::shared_ptr< BoWPredicate > bowP(new BoWPredicate()); + QList< boost::shared_ptr< BoWPredicate > > result; Token* token = get(vertex_token, posgraph, lgv); - bowP->setPosition(offset+token->position()); - bowP->setLength(token->length()); // FIXME handle the ambiguous case when there is several class values for the predicate QStringList predicateIds=annotationData->stringAnnotation(agv,Common::Misc::utf8stdstring2limastring("Predicate")).split("|"); @@ -1145,96 +1145,107 @@ boost::shared_ptr< BoWPredicate > BowGenerator::createPredicate( { LERROR << "BowGenerator::createPredicate Predicate has" << predicateIds.size() << "values:" << predicateIds; } - // FIXME replace the hardcoded VerbNet by a value from configuration - LWARN << "BowGenerator::createPredicate FIXME replace the hardcoded VerbNet by a value from configuration at" << __FILE__ << ", line"<< __LINE__; - LimaString predicate=LimaString("VerbNet.%1").arg(predicateIds.first()); - try + + + // FIXED replace the hardcoded VerbNet by a value from configuration + // LimaString predicate=predicateIds.first(); + // The fix should work only with FrameNet annotations. 
VerbNet does not assure to have the same + // number of roles in each list as the number of predicates + for (int i = 0 ; i < predicateIds.size(); i++) { - EntityType predicateEntity= Common::MediaticData::MediaticData::single().getEntityType(predicate); -#ifdef DEBUG_LP - LDEBUG << "BowGenerator::createPredicate The role(s) related to "<< predicate << " is/are "; -#endif - AnnotationGraph annotGraph=annotationData->getGraph(); - AnnotationGraphOutEdgeIt outIt, outIt_end; - boost::tie(outIt, outIt_end) = boost::out_edges(agv, annotationData->getGraph()); - QMultiMap > roles; - const LimaString typeAnnot="SemanticRole"; - for (; outIt != outIt_end; outIt++) + LimaString predicate = predicateIds[i]; + try { - // FIXME handle the ambiguous case when there is several values for each role - const AnnotationGraphVertex semRoleVx=boost::target(*outIt, annotGraph); - QStringList semRoleIds = annotationData->stringAnnotation(agv,semRoleVx,typeAnnot).split("|"); - if (semRoleIds.size()>1) + EntityType predicateEntity= Common::MediaticData::MediaticData::single().getEntityType(predicate); + #ifdef DEBUG_LP + LDEBUG << "BowGenerator::createPredicate The role(s) related to "<< predicate << " is/are "; + #endif + AnnotationGraph annotGraph=annotationData->getGraph(); + AnnotationGraphOutEdgeIt outIt, outIt_end; + boost::tie(outIt, outIt_end) = boost::out_edges(agv, annotationData->getGraph()); + QMultiMap > roles; + const LimaString typeAnnot="SemanticRole"; + for (; outIt != outIt_end; outIt++) { - LERROR << "BowGenerator::createPredicate Role has" << semRoleIds.size() << "values:" << semRoleIds; - } - // FIXME replace the hardcoded VerbNet by a value from configuration - LimaString semRole = LimaString("VerbNet.%1").arg(semRoleIds.first()); - LDEBUG << semRole; - try - { - EntityType semRoleEntity = Common::MediaticData::MediaticData::single().getEntityType(semRole); - std::set< LinguisticGraphVertex > posGraphSemRoleVertices = annotationData->matches("annot", semRoleVx, 
"PosGraph"); - if (!posGraphSemRoleVertices.empty()) + // FIXME handle the ambiguous case when there is several values for each role + const AnnotationGraphVertex semRoleVx=boost::target(*outIt, annotGraph); + QStringList semRoleIds = annotationData->stringAnnotation(agv,semRoleVx,typeAnnot).split("|"); + + if (semRoleIds.size()>1) { - LinguisticGraphVertex posGraphSemRoleVertex = *(posGraphSemRoleVertices.begin()); - if (posGraphSemRoleVertex == lgv) + LERROR << "BowGenerator::createPredicate Role has" << semRoleIds.size() << "values:" << semRoleIds; + } + LimaString semRole = semRoleIds[i]; + LDEBUG << semRole; + try + { + EntityType semRoleEntity = Common::MediaticData::MediaticData::single().getEntityType(semRole); + std::set< LinguisticGraphVertex > posGraphSemRoleVertices = annotationData->matches("annot", semRoleVx, "PosGraph"); + if (!posGraphSemRoleVertices.empty()) { - LERROR << "BowGenerator::createPredicate role vertex is the same as the trigger vertex. Abort this role."; - continue; + LinguisticGraphVertex posGraphSemRoleVertex = *(posGraphSemRoleVertices.begin()); + if (posGraphSemRoleVertex == lgv) + { + LERROR << "BowGenerator::createPredicate role vertex is the same as the trigger vertex. 
Abort this role."; + continue; + } + #ifdef DEBUG_LP + LDEBUG << "BowGenerator::createPredicate Calling createAbstractBoWElement on PoS graph vertex" << posGraphSemRoleVertex; + #endif + std::vector, boost::shared_ptr< AbstractBoWElement > > > semRoleTokens = createAbstractBoWElement(posGraphSemRoleVertex, anagraph,posgraph, offset, annotationData, visited, keepAnyway); + #ifdef DEBUG_LP + LDEBUG << "BowGenerator::createPredicate Created "<< semRoleTokens.size()<<"token for the role associated to " << predicate; + #endif + // if (semRoleTokens[0].second!="") + if (!semRoleTokens.empty()) + { + roles.insert(semRoleEntity, semRoleTokens[0].second); + } } -#ifdef DEBUG_LP - LDEBUG << "BowGenerator::createPredicate Calling createAbstractBoWElement on PoS graph vertex" << posGraphSemRoleVertex; -#endif - std::vector, boost::shared_ptr< AbstractBoWElement > > > semRoleTokens = createAbstractBoWElement(posGraphSemRoleVertex, anagraph,posgraph, offset, annotationData, visited, keepAnyway); -#ifdef DEBUG_LP - LDEBUG << "BowGenerator::createPredicate Created "<< semRoleTokens.size()<<"token for the role associated to " << predicate; -#endif -// if (semRoleTokens[0].second!="") - if (!semRoleTokens.empty()) + else { - roles.insert(semRoleEntity, semRoleTokens[0].second); + #ifdef DEBUG_LP + LDEBUG << "BowGenerator::createPredicate Found no matching for the semRole in the annot graph"; + #endif } } - else + catch (const Lima::LimaException& e) { -#ifdef DEBUG_LP - LDEBUG << "BowGenerator::createPredicate Found no matching for the semRole in the annot graph"; -#endif + LERROR << "BowGenerator::createPredicate Unknown semantic role" << semRole << ";" << e.what(); } } - catch (const Lima::LimaException& e) - { - LERROR << "BowGenerator::createPredicate Unknown semantic role" << semRole << ";" << e.what(); - } - } - bowP->setPredicateType(predicateEntity); - Common::MediaticData::EntityType pEntityType=bowP->getPredicateType(); - LDEBUG << "BowGenerator::createPredicate Created a 
Predicate for the verbal class " << Common::MediaticData::MediaticData::single().getEntityName(pEntityType); - if (!roles.empty()) - { - bowP->setRoles(roles); - QMultiMap >pRoles=bowP->roles(); - for (auto it = pRoles.begin(); - it != pRoles.end(); it++) + boost::shared_ptr< BoWPredicate > bowP(new BoWPredicate()); + bowP->setPosition(offset+token->position()); + bowP->setLength(token->length()); + bowP->setPredicateType(predicateEntity); + Common::MediaticData::EntityType pEntityType=bowP->getPredicateType(); + LDEBUG << "BowGenerator::createPredicate Created a Predicate for the verbal class " << Common::MediaticData::MediaticData::single().getEntityName(pEntityType); + if (!roles.empty()) { - boost::shared_ptr< BoWToken> outputRoles=boost::dynamic_pointer_cast(it.value()); - if (outputRoles != 0) + bowP->setRoles(roles); + QMultiMap >pRoles=bowP->roles(); + for (auto it = pRoles.begin(); + it != pRoles.end(); it++) { - LimaString roleLabel=Common::MediaticData::MediaticData::single().getEntityName(it.key()); -#ifdef DEBUG_LP - LDEBUG << "BowGenerator::createPredicate Associated "<< QString::fromUtf8(outputRoles->getOutputUTF8String().c_str()) << " to it" << "via the semantic role label "<< roleLabel ; -#endif + boost::shared_ptr< BoWToken> outputRoles=boost::dynamic_pointer_cast(it.value()); + if (outputRoles != 0) + { + LimaString roleLabel=Common::MediaticData::MediaticData::single().getEntityName(it.key()); + #ifdef DEBUG_LP + LDEBUG << "BowGenerator::createPredicate Associated "<< QString::fromUtf8(outputRoles->getOutputUTF8String().c_str()) << " to it" << "via the semantic role label "<< roleLabel ; + #endif + } } } + result.append(bowP); + } + catch (const Lima::LimaException& e) + { + LERROR << "BowGenerator::createPredicate Unknown predicate" << predicate << ";" << e.what(); + return QList< boost::shared_ptr< BoWPredicate > >(); } - return bowP; - } - catch (const Lima::LimaException& e) - { - LERROR << "BowGenerator::createPredicate Unknown predicate" 
<< predicate << ";" << e.what(); - return boost::shared_ptr< BoWPredicate >(); } + return result; } boost::shared_ptr< BoWPredicate > BowGenerator::createPredicate( diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowGeneration.h b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowGeneration.h index 87e0da543..703487b33 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowGeneration.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowGeneration.h @@ -213,7 +213,7 @@ class LIMA_ANALYSISDUMPERS_EXPORT BowGenerator // Common::BagOfWords::BoWPredicate* createPredicate(const Common::MediaticData::EntityType& t, QMultiMap roles) const; - boost::shared_ptr< Common::BagOfWords::BoWPredicate > createPredicate(const LinguisticGraphVertex& lgv, const AnnotationGraphVertex& agv, + QList< boost::shared_ptr< Common::BagOfWords::BoWPredicate > > createPredicate(const LinguisticGraphVertex& lgv, const AnnotationGraphVertex& agv, const Common::AnnotationGraphs::AnnotationData* annotationData, const LinguisticGraph& anagraph, const LinguisticGraph& posgraph, From c0fb94de67de1e6481d930a42e20008759ffa7d7 Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Thu, 28 Apr 2016 23:28:37 +0200 Subject: [PATCH 55/82] Implement less cluttering error handling --- .../KnowledgeBasedSemanticRoleLabeler.cpp | 113 ++++++++++++++---- 1 file changed, 87 insertions(+), 26 deletions(-) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp index c515050d1..a89c27786 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp +++ 
b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp @@ -43,6 +43,14 @@ using namespace Lima::LinguisticProcessing::LinguisticAnalysisStructure; using namespace Lima::Common::XMLConfigurationFiles; using namespace Lima::Common::Misc; + +#define HANDLE_ERROR(Y,Z) if ( Y ) Z ; +#define HANDLE_ERROR_EQUAL(X,Y,Z) if ( X == Y ) Z ; +#define HANDLE_ERROR_RETURN(X,Y,Z) if ( X ) { Y ; return Z; } +#define HANDLE_ERROR_EQUAL_RETURN(X,Y,Z,R) if ( X == Y ) { Z ; return R ; } +#define HANDLE_ERROR_DIFFERENT(X,Y,Z) if ( X != Y ) Z ; +#define HANDLE_ERROR_DIFFERENT_RETURN(X,Y,Z,R) if ( X != Y ) { Z ; return R ; } + namespace Lima { namespace LinguisticProcessing @@ -85,6 +93,21 @@ KnowledgeBasedSemanticRoleLabeler::~KnowledgeBasedSemanticRoleLabeler() delete m_d; } +auto failed_to_import_the_sys_module = []() +{ + SEMANTICANALYSISLOGINIT; + LERROR << "Failed to import the sys module"; + PyErr_Print(); +}; + +auto cannot_instantiate_the_semanticrolelabeler_python_class = []() +{ + SEMANTICANALYSISLOGINIT; + LERROR << "Cannot instantiate the SemanticRoleLabeler python class"; + PyErr_Print(); + Py_Exit(1); +}; + void KnowledgeBasedSemanticRoleLabeler::init( Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, Manager* manager) @@ -209,12 +232,8 @@ void KnowledgeBasedSemanticRoleLabeler::init( PyObject* main_module = PyImport_ImportModule("__main__"); PyObject* main_dict = PyModule_GetDict(main_module); PyObject* sys_module = PyImport_ImportModule("sys"); - if (sys_module == NULL) - { - LERROR << "Failed to import the sys module"; - PyErr_Print(); - } - PyObject* sys_dict = PyModule_GetDict(sys_module); + HANDLE_ERROR_EQUAL (sys_module, NULL, failed_to_import_the_sys_module() ); + PyDict_SetItemString(main_dict, "sys", sys_module); // Add the path to the knowledgesrl pachkage to putho path @@ -236,15 +255,40 @@ void KnowledgeBasedSemanticRoleLabeler::init( } // Create the semantic role labeller 
instance - m_d->m_instance = PyObject_CallMethod(semanticrolelabeler_module, "SemanticRoleLabeler", "[s]", QString("--log=%1").arg(kbsrlLogLevel).toUtf8().constData()); - if (m_d->m_instance == NULL) - { - LERROR << "Cannot instantiate the SemanticRoleLabeler python class"; - PyErr_Print(); - Py_Exit(1); - } + m_d->m_instance = PyObject_CallMethod(semanticrolelabeler_module, "SemanticRoleLabeler", "[ss]", + QString("--log=%1").arg(kbsrlLogLevel).toUtf8().constData(), + QString("--frame-lexicon=%1").arg(mode).toUtf8().constData()); + HANDLE_ERROR_EQUAL(m_d->m_instance,NULL,cannot_instantiate_the_semanticrolelabeler_python_class()) } +auto metadata_equal_zero = []() +{ + SEMANTICANALYSISLOGINIT; + LERROR << "no LinguisticMetaData ! abort"; +}; + +auto temporary_file_not_open = []() +{ + SEMANTICANALYSISLOGINIT; + LERROR << "KnowledgeBasedSemanticRoleLabeler: unable to create temporary file"; +}; + +auto temporary_file_srl_not_open = [](QScopedPointer& temporaryFile) +{ + SEMANTICANALYSISLOGINIT; + LERROR << "KnowledgeBasedSemanticRoleLabeler: unable to open temporary file for dumping SRL CoNLL data to it"<< temporaryFile->fileName(); + LERROR << "KnowledgeBasedSemanticRoleLabeler: keep (do not auto remove) it for debug purpose." ; + temporaryFile->setAutoRemove(false); +}; + +auto failed_to_load_data_from_temporary_file = [](QScopedPointer& temporaryFile) +{ + SEMANTICANALYSISLOGINIT; + LERROR << "KnowledgeBasedSemanticRoleLabeler: failed to load data from temporary file" << temporaryFile->fileName(); + LERROR << "KnowledgeBasedSemanticRoleLabeler: keep (do not auto remove) it for debug purpose." 
<< temporaryFile->fileName(); + temporaryFile->setAutoRemove(false); +}; + LimaStatusCode KnowledgeBasedSemanticRoleLabeler::process( AnalysisContent& analysis) const { @@ -253,17 +297,14 @@ LimaStatusCode KnowledgeBasedSemanticRoleLabeler::process( LINFO << "start SRL process"; LinguisticMetaData* metadata=static_cast(analysis.getData("LinguisticMetaData")); - if (metadata == 0) { - LERROR << "no LinguisticMetaData ! abort"; - return MISSING_DATA; - } + HANDLE_ERROR_EQUAL_RETURN(metadata,0,metadata_equal_zero(),MISSING_DATA) QScopedPointer temporaryFile; if (!m_d->m_temporaryFileMetadata.isEmpty()) { QScopedPointer otherTemp(new QTemporaryFile()); temporaryFile.swap(otherTemp); - temporaryFile->open(); + HANDLE_ERROR_RETURN(!temporaryFile->open(),temporary_file_not_open(),CANNOT_OPEN_FILE_ERROR); metadata->setMetaData(m_d->m_temporaryFileMetadata.toUtf8().constData(), temporaryFile->fileName().toUtf8().constData()); } @@ -293,7 +334,14 @@ LimaStatusCode KnowledgeBasedSemanticRoleLabeler::process( } else { - temporaryFile->open(); + if (!temporaryFile->open()) + { + SEMANTICANALYSISLOGINIT; + LERROR << "KnowledgeBasedSemanticRoleLabeler: unable to open temporary file after dumping CoNLL data to it"<< temporaryFile->fileName(); + LERROR << "KnowledgeBasedSemanticRoleLabeler: keep (do not auto remove) it for debug purpose." 
; + temporaryFile->setAutoRemove(false); + return CANNOT_OPEN_FILE_ERROR; + } conllInput = QString::fromUtf8(temporaryFile->readAll().constData()); temporaryFile->close(); } @@ -331,17 +379,30 @@ LimaStatusCode KnowledgeBasedSemanticRoleLabeler::process( } else { - temporaryFile->open(); - temporaryFile->seek(0); - temporaryFile->write(result); + HANDLE_ERROR_RETURN( !temporaryFile->open(), + temporary_file_srl_not_open(temporaryFile), CANNOT_OPEN_FILE_ERROR); + if (!temporaryFile->seek(0)) + { + SEMANTICANALYSISLOGINIT; + LERROR << "KnowledgeBasedSemanticRoleLabeler: unable to seek to the beginning of temporary file"<< temporaryFile->fileName(); + LERROR << "KnowledgeBasedSemanticRoleLabeler: keep (do not auto remove) it for debug purpose." ; + temporaryFile->setAutoRemove(false); + return UNKNOWN_ERROR; + } + if (temporaryFile->write(result) == -1) + { + SEMANTICANALYSISLOGINIT; + LERROR << "KnowledgeBasedSemanticRoleLabeler: unable to write SRL result to temporary file"<< temporaryFile->fileName(); + LERROR << "KnowledgeBasedSemanticRoleLabeler: keep (do not auto remove) it for debug purpose." 
; + temporaryFile->setAutoRemove(false); + return UNKNOWN_ERROR; + } temporaryFile->close(); } + Py_DECREF(callResult); // Import the CoNLL result returnCode=m_d->m_loader->process(analysis); - if (returnCode!=SUCCESS_ID) { - LERROR << "KnowledgeBasedSemanticRoleLabeler: failed to load data from temporary file"; - return returnCode; - } + HANDLE_ERROR_DIFFERENT_RETURN(returnCode,SUCCESS_ID,failed_to_load_data_from_temporary_file(temporaryFile),returnCode) return returnCode; From 37c86002d898f2e87d5ebc89dd97cc54c6aae604 Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Thu, 28 Apr 2016 23:30:23 +0200 Subject: [PATCH 56/82] Allow to load FrameNet output from SRL Before, only VerbNet was supported --- .../SemanticRoleLabelingLoader.cpp | 27 ++++++++++++++++--- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRoleLabelingLoader.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRoleLabelingLoader.cpp index ba6fb7279..a60d3c452 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRoleLabelingLoader.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRoleLabelingLoader.cpp @@ -123,6 +123,7 @@ class SemanticRoleLabelingLoaderPrivate MediaId m_language; std::string m_graph; + QString m_model; }; @@ -130,7 +131,8 @@ class SemanticRoleLabelingLoaderPrivate //*********************************************************************** SemanticRoleLabelingLoaderPrivate::SemanticRoleLabelingLoaderPrivate(): m_language(0), -m_graph("PosGraph") +m_graph("PosGraph"), +m_model("VerbNet") {} SemanticRoleLabelingLoaderPrivate::~SemanticRoleLabelingLoaderPrivate() @@ -162,6 +164,11 @@ void SemanticRoleLabelingLoader::init(Common::XMLConfigurationFiles::GroupConfig m_d->m_graph=unitConfiguration.getParamsValueAtKey("graph"); } catch (NoSuchParam& ) {} // keep default value + try 
+ { + m_d->m_model = QString::fromUtf8(unitConfiguration.getParamsValueAtKey("model").c_str()); + } + catch (NoSuchParam& ) {} // keep default value } @@ -221,9 +228,16 @@ LimaStatusCode SemanticRoleLabelingLoader::process(AnalysisContent& analysis) co #ifdef DEBUG_LP LDEBUG << "SemanticRoleLabelingLoader::process there is/are " << cHandler.m_verbalClassNb << "verbal class(es) for this sentence " ; #endif - for (int vClassIndex=0;vClassIndexm_model + "." + verbalClass; + } + LimaString verbalClass= verbalClasses.join("|"); + AnnotationGraphVertex annotPredicateVertex=annotationData->createAnnotationVertex(); annotationData->addMatching("PosGraph", posGraphPredicateVertex, "annot", annotPredicateVertex); @@ -236,7 +250,12 @@ LimaStatusCode SemanticRoleLabelingLoader::process(AnalysisContent& analysis) co std::vector >::iterator semRoleIt; for (semRoleIt=cHandler.m_semanticRoles[vClassIndex].begin(); semRoleIt!=cHandler.m_semanticRoles[vClassIndex].end();semRoleIt++){ LinguisticGraphVertex posGraphRoleVertex=(*semRoleIt).first; - LimaString semanticRole=(*semRoleIt).second; + QStringList semanticRoles = (*semRoleIt).second.split("|"); + for (QString& semanticRole: semanticRoles) + { + semanticRole = m_d->m_model + "." 
+ semanticRole; + } + LimaString semanticRole= semanticRoles.join("|"); AnnotationGraphVertex annotRoleVertex=annotationData->createAnnotationVertex(); AnnotationGraphEdge roleEdge=annotationData->createAnnotationEdge(annotPredicateVertex, annotRoleVertex); From 711c00039e622dd8853e879be3a1d81fde24c6c2 Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Fri, 29 Apr 2016 16:08:58 +0200 Subject: [PATCH 57/82] Correct handling of FrameNet SRL input --- .../core/AnalysisDumpers/BowGeneration.cpp | 9 +++------ .../core/SemanticAnalysis/SemanticRoleLabelingLoader.cpp | 3 ++- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowGeneration.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowGeneration.cpp index 4614fb66e..49b24980d 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowGeneration.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowGeneration.cpp @@ -1143,7 +1143,7 @@ QList< boost::shared_ptr< BoWPredicate > > BowGenerator::createPredicate( QStringList predicateIds=annotationData->stringAnnotation(agv,Common::Misc::utf8stdstring2limastring("Predicate")).split("|"); if (predicateIds.size()>1) { - LERROR << "BowGenerator::createPredicate Predicate has" << predicateIds.size() << "values:" << predicateIds; + LDEBUG << "BowGenerator::createPredicate Predicate has" << predicateIds.size() << "values:" << predicateIds; } @@ -1170,13 +1170,10 @@ QList< boost::shared_ptr< BoWPredicate > > BowGenerator::createPredicate( // FIXME handle the ambiguous case when there is several values for each role const AnnotationGraphVertex semRoleVx=boost::target(*outIt, annotGraph); QStringList semRoleIds = annotationData->stringAnnotation(agv,semRoleVx,typeAnnot).split("|"); - - if (semRoleIds.size()>1) - { - LERROR << "BowGenerator::createPredicate Role has" << semRoleIds.size() << "values:" << 
semRoleIds; - } + Q_ASSERT(predicateIds.size() == semRoleIds.size()); LimaString semRole = semRoleIds[i]; LDEBUG << semRole; + if (semRole.isEmpty()) continue; try { EntityType semRoleEntity = Common::MediaticData::MediaticData::single().getEntityType(semRole); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRoleLabelingLoader.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRoleLabelingLoader.cpp index a60d3c452..7076ca7ce 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRoleLabelingLoader.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRoleLabelingLoader.cpp @@ -253,7 +253,8 @@ LimaStatusCode SemanticRoleLabelingLoader::process(AnalysisContent& analysis) co QStringList semanticRoles = (*semRoleIt).second.split("|"); for (QString& semanticRole: semanticRoles) { - semanticRole = m_d->m_model + "." + semanticRole; + if (!semanticRole.isEmpty()) + semanticRole = m_d->m_model + "." 
+ semanticRole; } LimaString semanticRole= semanticRoles.join("|"); From bee4ddc9763b426e38dfcc73d4dc97396842bc39 Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Mon, 2 May 2016 10:52:16 +0200 Subject: [PATCH 58/82] Avoid a crash when there is no sentence to dump --- .../core/SemanticAnalysis/ConllDumper.cpp | 54 +++++++++++++------ .../KnowledgeBasedSemanticRoleLabeler.cpp | 15 +++--- .../SemanticRoleLabelingLoader.cpp | 2 +- 3 files changed, 49 insertions(+), 22 deletions(-) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/ConllDumper.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/ConllDumper.cpp index 53c4bad8c..b415504b4 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/ConllDumper.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/ConllDumper.cpp @@ -201,29 +201,37 @@ void ConllDumper::init(Common::XMLConfigurationFiles::GroupConfigurationStructur LimaStatusCode ConllDumper::process(AnalysisContent& analysis) const { - DUMPERLOGINIT; #ifdef DEBUG_LP + DUMPERLOGINIT; LDEBUG << "ConllDumper::process"; #endif - LinguisticMetaData* metadata=static_cast(analysis.getData("LinguisticMetaData")); - if (metadata == 0) { - LERROR << "ConllDumper::process no LinguisticMetaData ! abort"; - return MISSING_DATA; + LinguisticMetaData* metadata = static_cast(analysis.getData("LinguisticMetaData")); + if (metadata == 0) + { + DUMPERLOGINIT; + LERROR << "ConllDumper::process no LinguisticMetaData ! abort"; + return MISSING_DATA; } AnnotationData* annotationData = static_cast(analysis.getData("AnnotationData")); - if (annotationData == 0) { - LERROR << "ConllDumper::process no AnnotationData ! abort"; - return MISSING_DATA; + if (annotationData == 0) + { + DUMPERLOGINIT; + LERROR << "ConllDumper::process no AnnotationData ! 
abort"; + return MISSING_DATA; } AnalysisGraph* tokenList=static_cast(analysis.getData(m_d->m_graph));//est de type PosGraph et non pas AnalysisGraph - if (tokenList==0) { + if (tokenList==0) + { + DUMPERLOGINIT; LERROR << "ConllDumper::process graph " << m_d->m_graph << " has not been produced: check pipeline"; return MISSING_DATA; } LinguisticGraph* graph=tokenList->getGraph(); SegmentationData* sd=static_cast(analysis.getData("SentenceBoundaries")); - if (sd==0) { + if (sd==0) + { + DUMPERLOGINIT; LERROR << "ConllDumper::process no SentenceBoundaries! abort"; return MISSING_DATA; } @@ -241,8 +249,15 @@ LimaStatusCode ConllDumper::process(AnalysisContent& analysis) const std::map< LinguisticGraphVertex, std::pair > vertexDependencyInformations; - std::vector::iterator sbItr=(sd->getSegments().begin()); uint64_t nbSentences((sd->getSegments()).size()); + if (nbSentences == 0) + { + DUMPERLOGINIT; + LERROR << "ConllDumper::process 0 sentence to process"; + return SUCCESS_ID; + } + + std::vector::iterator sbItr=(sd->getSegments().begin()); #ifdef DEBUG_LP LDEBUG << "ConllDumper::process There are "<< nbSentences << " sentences"; #endif @@ -310,9 +325,7 @@ LimaStatusCode ConllDumper::process(AnalysisContent& analysis) const #ifdef DEBUG_LP LDEBUG << "ConllDumper::process relation = " << syntRelName; LDEBUG << "ConllDumper::process Src : Dep vertex= " << boost::source(*dit, *depGraph); -#endif LinguisticGraphVertex src = syntacticData->tokenVertexForDepVertex(boost::source(*dit, *depGraph)); -#ifdef DEBUG_LP LDEBUG << "ConllDumper::process Src : Morph vertex= " << src; LDEBUG << "ConllDumper::process Targ : Dep vertex= " << boost::target(*dit, *depGraph); #endif @@ -429,7 +442,15 @@ LimaStatusCode ConllDumper::process(AnalysisContent& analysis) const #ifdef DEBUG_LP LDEBUG << "ConllDumper::process target saved for" << v << "is" << target; #endif - targetConllId=segmentationMapping.find(target)->second; + if (segmentationMapping.find(target) != 
segmentationMapping.end()) + { + targetConllId=segmentationMapping.find(target)->second; + } + else + { + DUMPERLOGINIT; + LERROR << "ConllDumper::process target" << target << "not found in segmantation mapping"; + } #ifdef DEBUG_LP LDEBUG << "ConllDumper::process conll target saved for " << tokenId << " is " << targetConllId; #endif @@ -582,13 +603,16 @@ LimaStatusCode ConllDumper::process(AnalysisContent& analysis) const QMultiMap ConllDumperPrivate::collectPredicateTokens( Lima::AnalysisContent& analysis, LinguisticGraphVertex sentenceBegin, LinguisticGraphVertex sentenceEnd) { +#ifdef DEBUG_LP DUMPERLOGINIT; - QMap result; +#endif + QMap result; AnnotationData* annotationData = static_cast(analysis.getData("AnnotationData")); AnalysisGraph* tokenList=static_cast(analysis.getData(m_graph)); if (tokenList==0) { + DUMPERLOGINIT; LERROR << "graph " << m_graph << " has not been produced: check pipeline"; return result; } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp index a89c27786..6ad7453a4 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp @@ -289,6 +289,14 @@ auto failed_to_load_data_from_temporary_file = [](QScopedPointer temporaryFile->setAutoRemove(false); }; +auto failure_during_call_of_the_annotate_method_on = [](QString& conllInput) +{ + SEMANTICANALYSISLOGINIT; + LERROR << "Failure during call of the annotate method on" << conllInput; + PyErr_Print(); + Py_Exit(1); +}; + LimaStatusCode KnowledgeBasedSemanticRoleLabeler::process( AnalysisContent& analysis) const { @@ -348,12 +356,7 @@ LimaStatusCode KnowledgeBasedSemanticRoleLabeler::process( // Run the semantic role labeller 
PyObject* callResult = PyObject_CallMethod(m_d->m_instance, "annotate", "s", conllInput.toUtf8().constData()); - if (callResult == NULL) - { - LERROR << "Failed to call the annotate method"; - PyErr_Print(); - Py_Exit(1); - } + HANDLE_ERROR_EQUAL(callResult, NULL, failure_during_call_of_the_annotate_method_on(conllInput)); // Display the SRL result char* result = PyUnicode_AsUTF8(callResult); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRoleLabelingLoader.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRoleLabelingLoader.cpp index 7076ca7ce..dd074ac27 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRoleLabelingLoader.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRoleLabelingLoader.cpp @@ -344,7 +344,7 @@ bool ConllHandler::extractSemanticInformation(int sentenceI, LimaConllTokenIdMap #ifdef DEBUG_LP LDEBUG << "ConllHandler::extractSemanticInformation"<<"nb descriptors and roleTargetFieldIndex" << descriptors.size() << roleTargetFieldIndex ; #endif - if (NBCOLSINSRLBEFOREFRAME+roleTargetFieldIndex >= descriptors.size()) + if (NBCOLSINSRLBEFOREFRAME+1+roleTargetFieldIndex >= descriptors.size()) { LERROR << "ConllHandler::extractSemanticInformation roleTargetFieldIndex error" << roleTargetFieldIndex; break; From 69a4a42a8635af4bcff4e7473cc6882ef8410b9d Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Thu, 26 May 2016 15:56:58 +0200 Subject: [PATCH 59/82] New parsing constraint --- .../HomoSyntagmaticConstraints.cpp | 91 ++++++++++++++++--- .../HomoSyntagmaticConstraints.h | 34 ++++++- 2 files changed, 112 insertions(+), 13 deletions(-) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/HomoSyntagmaticConstraints.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/HomoSyntagmaticConstraints.cpp index 06746e98f..8de299724 
100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/HomoSyntagmaticConstraints.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/HomoSyntagmaticConstraints.cpp @@ -51,6 +51,7 @@ #include #include #include +#include //using namespace boost; using namespace Lima::Common::MediaticData; @@ -97,6 +98,9 @@ CreateRelationReverseWithRelatedFactory(CreateRelationReverseWithRelatedId); Automaton::ConstraintFunctionFactory CopyRelationsOutOfToFactory(CopyRelationsOutOfToId); +Automaton::ConstraintFunctionFactory +CopyIncomingRelationsToFactory(CopyIncomingRelationsToId); + Automaton::ConstraintFunctionFactory CreateCompoundTenseFactory(CreateCompoundTenseId); @@ -464,13 +468,14 @@ bool RemoveOutRelationFrom::operator()(const AnalysisGraph& graph, CopyRelationsOutOfTo::CopyRelationsOutOfTo(MediaId language, const LimaString& complement): - ConstraintWithRelationComplement(language,complement) + Automaton::ConstraintFunction(language,complement), + m_relations(complement.split(",")) { /* Critical function : comment logging message */ -// SAPLOGINIT; -// LDEBUG << "CopyRelationsOutOfTo::CopyRelationsOutOfTo" << language << complement << m_relation; + SAPLOGINIT; + LDEBUG << "CopyRelationsOutOfTo::CopyRelationsOutOfTo" << language << complement << m_relations; } bool CopyRelationsOutOfTo::operator()(const AnalysisGraph& graph, @@ -481,8 +486,8 @@ bool CopyRelationsOutOfTo::operator()(const AnalysisGraph& graph, /* Critical function : comment logging message */ -// SAPLOGINIT; -// LDEBUG << "CopyRelationsOutOfTo" << v1 << v2; + SAPLOGINIT; + LDEBUG << "CopyRelationsOutOfTo" << v1 << v2; SyntacticData* syntacticData=static_cast(analysis.getData("SyntacticData")); if ( v1 == graph.firstVertex() || v1 == graph.lastVertex() || v2 == graph.firstVertex() || v2 == graph.lastVertex() ) @@ -494,19 +499,80 @@ bool CopyRelationsOutOfTo::operator()(const AnalysisGraph& graph, DependencyGraphVertex dv1 = syntacticData-> 
depVertexForTokenVertex(v1); DependencyGraphOutEdgeIt it, it_end; boost::tie(it, it_end) = out_edges(dv1, *(syntacticData-> dependencyGraph())); - bool res = true; + bool res = false; for (; it != it_end; it++) { - LinguisticGraphVertex target = syntacticData->tokenVertexForDepVertex(boost::target(*it,*(syntacticData-> dependencyGraph()))); + QString relation = QString::fromUtf8(static_cast(Common::MediaticData::MediaticData::single().mediaData(m_language_id)).getSyntacticRelationName(map[*it]).c_str()); + LDEBUG << "CopyRelationsOutOfTo" << relation << m_relations; + if (m_relations.contains(relation)) + { + LDEBUG << "CopyRelationsOutOfTo copying" << relation; + LinguisticGraphVertex target = syntacticData->tokenVertexForDepVertex(boost::target(*it,*(syntacticData-> dependencyGraph()))); - res = syntacticData->relation(v2, target, map[*it]); - if (!res) break; + if (syntacticData->relation(v2, target, map[*it])) + res = true; + } } -// LDEBUG << "CopyRelationsOutOfTo:" << res; - return res; + LDEBUG << "CopyRelationsOutOfTo:" << res; + return true; } +//********************************************************************** + +CopyIncomingRelationsTo::CopyIncomingRelationsTo(MediaId language, + const LimaString& complement): + Automaton::ConstraintFunction(language,complement), + m_relations(complement.split(",")) +{ +/* + Critical function : comment logging message +*/ + SAPLOGINIT; + LDEBUG << "CopyIncomingRelationsTo::CopyIncomingRelationsTo" << language << complement << m_relations; +} + +bool CopyIncomingRelationsTo::operator()(const AnalysisGraph& graph, + const LinguisticGraphVertex& v1, + const LinguisticGraphVertex& v2, + AnalysisContent& analysis) const +{ +/* + Critical function : comment logging message +*/ + SAPLOGINIT; + LDEBUG << "CopyIncomingRelationsTo" << v1 << v2; + SyntacticData* syntacticData=static_cast(analysis.getData("SyntacticData")); + if ( v1 == graph.firstVertex() || v1 == graph.lastVertex() + || v2 == graph.firstVertex() || v2 == 
graph.lastVertex() ) + { + LDEBUG << "CopyIncomingRelationsTo: false"; + return false; + } + EdgeDepRelTypePropertyMap map = get(edge_deprel_type, *(syntacticData-> dependencyGraph())); + + DependencyGraphVertex dv1 = syntacticData-> depVertexForTokenVertex(v1); + DependencyGraphInEdgeIt it, it_end; + boost::tie(it, it_end) = in_edges(dv1, *(syntacticData-> dependencyGraph())); + bool res = false; + for (; it != it_end; it++) + { + LinguisticGraphVertex source = syntacticData->tokenVertexForDepVertex(boost::source(*it,*(syntacticData-> dependencyGraph()))); + QString relation = QString::fromUtf8(static_cast(Common::MediaticData::MediaticData::single().mediaData(m_language_id)).getSyntacticRelationName(map[*it]).c_str()); + LDEBUG << "CopyIncomingRelationsTo" << relation << m_relations; + if (m_relations.contains(relation)) + { + LDEBUG << "CopyIncomingRelationsTo copying" << relation; + if (syntacticData->relation(source, v2, map[*it])) + res = true; + } + } + + LDEBUG << "CopyIncomingRelationsTo:" << res; + return true; +} + + //********************************************************************** @@ -1152,6 +1218,9 @@ bool CreateCompoundTense::operator()(const AnalysisGraph& anagraph, recoData = new RecognizerData(); analysis.setData("RecognizerData", recoData); } +#ifdef DEBUG_LP + LDEBUG << "CreateCompoundTense setNextVertex:" << newVertex; +#endif recoData->setNextVertex(newVertex); #ifdef DEBUG_LP diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/HomoSyntagmaticConstraints.h b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/HomoSyntagmaticConstraints.h index e61c6129c..efc33adf4 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/HomoSyntagmaticConstraints.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/HomoSyntagmaticConstraints.h @@ -57,6 +57,7 @@ namespace SyntacticAnalysis { #define CreateRelationWithRelatedId 
"CreateRelationWithRelated" #define CreateRelationReverseWithRelatedId "CreateRelationReverseWithRelated" #define CopyRelationsOutOfToId "CopyRelationsOutOfTo" +#define CopyIncomingRelationsToId "CopyIncomingRelationsTo" #define CreateCompoundTenseId "CreateCompoundTense" #define CreateEasyCompoundTenseId "CreateEasyCompoundTense" #define FindRelationFromId "FindRelationFrom" @@ -243,6 +244,10 @@ class LIMA_SYNTACTICANALYSIS_EXPORT CreateRelationWithRelated : public Automaton * @brief This constraint add in the relations buffer the relations of the given * type from the targets of relations out of v2 of the given types to v1. * + * It allows to draw a relation (of type the last element in the complement + * list) from the target (v1) of the given relations (all except the last in the + * complement list) to the trigger. + * * The complement must be of the form: * "rel2|…|reln,rel1" * with rel1 the type of the relation to create and rel2, …, reln the types of @@ -286,9 +291,9 @@ class LIMA_SYNTACTICANALYSIS_EXPORT RemoveOutRelationFrom : public ConstraintWit }; /** - *@brief Copy all relations out of v1 t relations out of v2. Targets and types are kept. + *@brief Copy all relations out of v1 to relations out of v2. Targets and types are kept. */ -class LIMA_SYNTACTICANALYSIS_EXPORT CopyRelationsOutOfTo : public ConstraintWithRelationComplement +class LIMA_SYNTACTICANALYSIS_EXPORT CopyRelationsOutOfTo : public Automaton::ConstraintFunction { public: explicit CopyRelationsOutOfTo(MediaId language, @@ -300,6 +305,31 @@ class LIMA_SYNTACTICANALYSIS_EXPORT CopyRelationsOutOfTo : public ConstraintWith AnalysisContent& analysis) const; private: + QStringList m_relations; +}; + +/** + *@brief Copy all relations incoming to v1 to relations incoming to of v2. Targets and types are kept. 
+ * + * Used to recopy relations + * - pointing to the first member of a coordination (target of COORD1) to the second member + * (target of COORD2) or + * - pointing to the second member of a coordination (target of COORD2) to the first member + * (target of COORD1) + */ +class LIMA_SYNTACTICANALYSIS_EXPORT CopyIncomingRelationsTo : public Automaton::ConstraintFunction +{ +public: + explicit CopyIncomingRelationsTo(MediaId language, + const LimaString& complement=LimaString()); + ~CopyIncomingRelationsTo() {} + bool operator()(const LinguisticAnalysisStructure::AnalysisGraph& graph, + const LinguisticGraphVertex& v1, + const LinguisticGraphVertex& v2, + AnalysisContent& analysis) const; + +private: + QStringList m_relations; }; /** @brief This constraint creates a TEMPCOMP relation between its two From f22df586835c18c8329737d7429942338a871b59 Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Thu, 26 May 2016 15:58:37 +0200 Subject: [PATCH 60/82] New eng parsing rules --- .../eng/rules-eng-finalize.txt | 47 +++++++++++++++++ .../eng/simplification-quotes-rules-eng.txt | 50 +++++++++++++++++++ .../eng/sub-automatons-eng.txt | 14 ++++++ .../conf/lima-lp-eng.xml | 33 ++++++++++++ 4 files changed, 144 insertions(+) create mode 100644 lima_linguisticdata/syntacticAnalysis/eng/rules-eng-finalize.txt create mode 100644 lima_linguisticdata/syntacticAnalysis/eng/simplification-quotes-rules-eng.txt create mode 100644 lima_linguisticdata/syntacticAnalysis/eng/sub-automatons-eng.txt diff --git a/lima_linguisticdata/syntacticAnalysis/eng/rules-eng-finalize.txt b/lima_linguisticdata/syntacticAnalysis/eng/rules-eng-finalize.txt new file mode 100644 index 000000000..2e85779b6 --- /dev/null +++ b/lima_linguisticdata/syntacticAnalysis/eng/rules-eng-finalize.txt @@ -0,0 +1,47 @@ +########################################################### +# +# rules to recopy dependencies pointing to coordinated +# tokens onto the other member of the coordination +# +# Created on Wed May 11 
2016 +# by Gael de Chalendar (Gael.de-Chalendar@cea.fr) +# +########################################################### + +set encoding=utf8 +using modex lima-analysis.xml +using groups LinguisticProcessing + +#---------------------------------------------------------------------- +# microcategories classes +#---------------------------------------------------------------------- +use categoriesClassesDeclaration-eng.txt + +#---------------------------------------------------------------------- +# +#---------------------------------------------------------------------- +@ConjCoord:@Tout (@Tout){0-n}:(@Tout){0-n} @Tout:SYNTACTIC_RELATION: ++!SecondUngovernedBy(trigger.1,left.1,"COORD1") ++!SecondUngovernedBy(trigger.1,right.2,"COORD2") ++CopyIncomingRelationsTo(left.1,right.2,"SUJ_V") +=>AddRelationInGraph() +=AddRelationInGraph() +=AddRelationInGraph() +#=Simplify() +=Simplify() +=Simplify() += + + + + @@ -53,6 +57,7 @@ + @@ -470,6 +475,22 @@ + + + + + + + + + + + + + + + + @@ -962,6 +983,18 @@ + + + + + + + + + + + + From 5a216a06957f8e7d3b7c9f70aa0e4ae5e55cb0bc Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Thu, 26 May 2016 16:08:23 +0200 Subject: [PATCH 61/82] Make the FrameNet Modex available at compile time --- lima_linguisticdata/cmake/LinguisticData.cmake | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/lima_linguisticdata/cmake/LinguisticData.cmake b/lima_linguisticdata/cmake/LinguisticData.cmake index bac286f46..e2150b140 100644 --- a/lima_linguisticdata/cmake/LinguisticData.cmake +++ b/lima_linguisticdata/cmake/LinguisticData.cmake @@ -357,6 +357,10 @@ macro (LIMA_GENERIC_CONFIGENV _lang) # Add custom command to copy files to execEnv (rules to produce them) # and Add destitation files to lima-execEnv target's dependencies list + CustomCopyFileAndAddExecEnvDependency( + ${CMAKE_SOURCE_DIR}/SRLIntegration/FrameNet-modex.xml + ${CMAKE_BINARY_DIR}/execEnv/config/FrameNet-modex.xml + ) CustomCopyFileAndAddExecEnvDependency( 
${CMAKE_SOURCE_DIR}/SRLIntegration/VerbNet-modex.xml ${CMAKE_BINARY_DIR}/execEnv/config/VerbNet-modex.xml @@ -552,6 +556,17 @@ macro (SPECIFICENTITIESCONFIGENV _subtarget _lang _group) COMMENT "create config env for specific entities rules (${_group}-modex.xml)" VERBATIM ) + add_custom_command( + OUTPUT ${CMAKE_BINARY_DIR}/execEnv/config/FrameNet-modex.xml + COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_BINARY_DIR}/execEnv/config + COMMAND ${CMAKE_COMMAND} -E copy + ${CMAKE_SOURCE_DIR}/SRLIntegration/FrameNet-modex.xml + ${CMAKE_BINARY_DIR}/execEnv/config/FrameNet-modex.xml + DEPENDS + ${CMAKE_SOURCE_DIR}/SRLIntegration/FrameNet-modex.xml + COMMENT "create config env for specific entities rules (FrameNet-modex.xml)" + VERBATIM + ) add_custom_command( OUTPUT ${CMAKE_BINARY_DIR}/execEnv/config/VerbNet-modex.xml COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_BINARY_DIR}/execEnv/config @@ -625,6 +640,7 @@ macro (SPECIFICENTITIESCONFIGENV _subtarget _lang _group) rules-${_lang}-${_group}-configEnv-${_subtarget} ALL DEPENDS ${CMAKE_BINARY_DIR}/execEnv/config/${_group}-modex.xml + DEPENDS ${CMAKE_BINARY_DIR}/execEnv/config/FrameNet-modex.xml DEPENDS ${CMAKE_BINARY_DIR}/execEnv/config/VerbNet-modex.xml DEPENDS ${CMAKE_BINARY_DIR}/execEnv/config/SpecificEntities-modex.xml DEPENDS ${CMAKE_BINARY_DIR}/execEnv/config/lima-common-${_lang}.xml From 11ca9ade0fb57f48a5bc0e46a9d8f462292de658 Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Thu, 26 May 2016 16:10:50 +0200 Subject: [PATCH 62/82] Corrections on eng tokenizer --- .../LinguisticProcessings/eng/tokenizerAutomaton-eng.tok | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lima_linguisticdata/scratch/LinguisticProcessings/eng/tokenizerAutomaton-eng.tok b/lima_linguisticdata/scratch/LinguisticProcessings/eng/tokenizerAutomaton-eng.tok index 412856c09..baa4bb172 100644 --- a/lima_linguisticdata/scratch/LinguisticProcessings/eng/tokenizerAutomaton-eng.tok +++ 
b/lima_linguisticdata/scratch/LinguisticProcessings/eng/tokenizerAutomaton-eng.tok @@ -80,6 +80,7 @@ - c_grave / GRAVE (T_WORD_BRK) - c_del1|c_comma|c_slash|c_hyphen|c_quote|c_percent|c_fraction|m_line / DELIMITER (T_WORD_BRK) - c_del2|c_dot c_par c_par / DELIMITER (T_PARAGRAPH_BRK) + - c_dot c_b c_dot / DELIMITER (T_WORD_BRK) - c_dot c_dot c_dot = START (T_SENTENCE_BRK) - c_del2|c_dot / DELIMITER (T_SENTENCE_BRK) - c_lowline / ALPHA @@ -98,6 +99,8 @@ (DELIMITER) { - m_eof = END - c_grave = GRAVE (T_WORD_BRK) + - c_b c_dot > DELIMITER + - c_dot c_b > DELIMITER - c_del1|c_comma|c_slash|c_hyphen|c_quote|c_percent|c_fraction|c_dot|m_line = DELIMITER (T_WORD_BRK) - c_5 = INTEGER (T_NUMERIC,T_INTEGER) - c_par = IGNORE (T_SENTENCE_BRK) From 90362f671b2fc7380a6b7995bf0e1642764e7dfc Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Fri, 3 Jun 2016 13:59:58 +0200 Subject: [PATCH 63/82] Make compile after cherry-picking --- .../moduleConfigurationStructure.cpp | 4 +-- .../xmlConfigurationFileParser.cpp | 12 ++++----- .../SRLIntegration/CMakeLists.txt | 2 +- .../PosTagger/DynamicSvmToolPosTagger.cpp | 2 +- .../KnowledgeBasedSemanticRoleLabeler.cpp | 2 +- .../SemanticRelationsXmlLogger.cpp | 27 +++++++++---------- 6 files changed, 24 insertions(+), 25 deletions(-) diff --git a/lima_common/src/common/XMLConfigurationFiles/moduleConfigurationStructure.cpp b/lima_common/src/common/XMLConfigurationFiles/moduleConfigurationStructure.cpp index fac20d6b2..4351377c2 100644 --- a/lima_common/src/common/XMLConfigurationFiles/moduleConfigurationStructure.cpp +++ b/lima_common/src/common/XMLConfigurationFiles/moduleConfigurationStructure.cpp @@ -110,12 +110,12 @@ string& ModuleConfigurationStructure::getParamValueAtKeyOfGroupNamed(const std:: } catch (NoSuchGroup& nsg) { - LWARN << "Getting param '"<m_configurationFileName.c_str() << std::endl; - LWARN << nsm.what().c_str() << " " << m_d->m_configurationFileName.c_str(); + std::cerr << nsm.what() << " " << 
m_d->m_configurationFileName.c_str() << std::endl; + LWARN << nsm.what() << " " << m_d->m_configurationFileName.c_str(); //not LERROR because user may want the module to be optional -> no error throw; } catch(NoSuchGroup& nsg) { - std::cerr << nsg.what().c_str() << " " << m_d->m_configurationFileName.c_str() << std::endl; - LWARN << nsg.what().c_str() << " " << m_d->m_configurationFileName.c_str(); + std::cerr << nsg.what() << " " << m_d->m_configurationFileName.c_str() << std::endl; + LWARN << nsg.what() << " " << m_d->m_configurationFileName.c_str(); throw; } catch(NoSuchParam& nsp) { - std::cerr << nsp.what().c_str() << " " << m_d->m_configurationFileName.c_str() << std::endl; - LWARN << nsp.what().c_str() << " " << m_d->m_configurationFileName.c_str(); + std::cerr << nsp.what() << " " << m_d->m_configurationFileName.c_str() << std::endl; + LWARN << nsp.what() << " " << m_d->m_configurationFileName.c_str(); throw; } catch(...) diff --git a/lima_linguisticdata/SRLIntegration/CMakeLists.txt b/lima_linguisticdata/SRLIntegration/CMakeLists.txt index 60bc982b6..8021cec02 100644 --- a/lima_linguisticdata/SRLIntegration/CMakeLists.txt +++ b/lima_linguisticdata/SRLIntegration/CMakeLists.txt @@ -162,7 +162,7 @@ foreach(LANG ${LIMA_LANGUAGES}) add_custom_command( OUTPUT FrameNet-${LANG}.bin - COMMAND compile-rules --resourcesDir=${LIMA_RESOURCES} --configDir=${LIMA_CONF} --language=${LANG} -oFrameNet-${LANG}.bin ${_current} --modex=FrameNet-modex.xml ${CMAKE_CURRENT_SOURCE_DIR}/FrameNet-${LANG}.rules + COMMAND compile-rules --resourcesDir=${CMAKE_BINARY_DIR}/execEnv/resources --configDir=${CMAKE_BINARY_DIR}/execEnv/config --language=${LANG} -oFrameNet-${LANG}.bin ${_current} --modex=FrameNet-modex.xml ${CMAKE_CURRENT_SOURCE_DIR}/FrameNet-${LANG}.rules DEPENDS ${_current} ${DEPENDENCIES} # WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} VERBATIM diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/DynamicSvmToolPosTagger.cpp 
b/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/DynamicSvmToolPosTagger.cpp index 8c923ed56..820002935 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/DynamicSvmToolPosTagger.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/DynamicSvmToolPosTagger.cpp @@ -46,7 +46,7 @@ #include #include #include -#include +#include #include // LDBL_MIN/MAX #include // log diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp index 6ad7453a4..5986265a1 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp @@ -29,7 +29,7 @@ #include "linguisticProcessing/core/LinguisticProcessors/LinguisticMetaData.h" #include "linguisticProcessing/core/LinguisticAnalysisStructure/AnalysisGraph.h" #include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" -#include "common/tools/FileUtils.h" +//#include "common/tools/FileUtils.h" #include "common/MediaticData/mediaticData.h" #include "common/time/timeUtilsController.h" diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRelationsXmlLogger.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRelationsXmlLogger.cpp index ab0857772..c35328c5a 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRelationsXmlLogger.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRelationsXmlLogger.cpp @@ -29,11 +29,11 @@ #include "SemanticRelationAnnotation.h" #include "SemanticAnnotation.h" -// #include "common/linguisticData/linguisticData.h" -#include 
"common/misc/strwstrtools.h" -#include "common/misc/traceUtils.h" +#include "common/MediaticData/mediaticData.h" +#include "common/Data/strwstrtools.h" +#include "common/time/traceUtils.h" #include "common/AbstractFactoryPattern/SimpleFactory.h" -#include "common/annotationGraph/AnnotationData.h" +//#include "common/annotationGraph/AnnotationData.h" #include "linguisticProcessing/core/LinguisticProcessors/LinguisticMetaData.h" #include "linguisticProcessing/core/LinguisticAnalysisStructure/AnalysisGraph.h" #include "linguisticProcessing/core/LinguisticAnalysisStructure/Token.h" @@ -52,7 +52,7 @@ namespace Lima { namespace LinguisticProcessing { namespace SemanticAnalysis { -SimpleFactory +SimpleFactory semanticRelationsXmlLoggerFactory(SEMANTICRELATIONSXMLLOGGER_CLASSID); SemanticRelationsXmlLogger::SemanticRelationsXmlLogger() : @@ -72,7 +72,7 @@ void SemanticRelationsXmlLogger::init( { AbstractLinguisticLogger::init(unitConfiguration,manager); - m_language=manager->getInitializationParameters().language; + m_language=manager->getInitializationParameters().media; try { @@ -152,13 +152,13 @@ process(AnalysisContent& analysis) const for (; itv != itv_end; itv++) { LDEBUG << "SemanticRelationsXmlLogger on annotation vertex " << *itv; - if (annotationData->hasAnnotation(*itv,Common::Misc::utf8stdstring2limastring("SemanticAnnotation"))) + if (annotationData->hasAnnotation(*itv,("SemanticAnnotation"))) { // LDEBUG << " it has SemanticRelationAnnotation"; const SemanticAnnotation* annot = 0; try { - annot = annotationData->annotation(*itv,Common::Misc::utf8stdstring2limastring("SemanticAnnotation")) + annot = annotationData->annotation(*itv,("SemanticAnnotation")) .pointerValue(); } catch (const boost::bad_any_cast& e) @@ -182,14 +182,14 @@ process(AnalysisContent& analysis) const for (; it != it_end; it++) { LDEBUG << "SemanticRelationsXmlLogger on annotation edge " << source(*it,annotGraph) << "->" << target(*it,annotationData->getGraph()); - if 
(annotationData->hasAnnotation(*it,Common::Misc::utf8stdstring2limastring("SemanticRelation"))) + if (annotationData->hasAnnotation(*it,("SemanticRelation"))) { SEMLOGINIT; LDEBUG << "found semantic relation"; const SemanticRelationAnnotation* annot = 0; try { - annot = annotationData->annotation(*it,Common::Misc::utf8stdstring2limastring("SemanticRelation")) + annot = annotationData->annotation(*it,("SemanticRelation")) .pointerValue(); } catch (const boost::bad_any_cast& e) @@ -247,9 +247,8 @@ vertexStringForSemanticAnnotation(const std::string& vertexRole, // otherwise, its type is "token" std::string type("token"); - std::set< uint32_t > matches = annotationData->matches(m_graph,v,"annot"); - for (std::set< uint32_t >::const_iterator it = matches.begin(); - it != matches.end(); it++) + auto matches = annotationData->matches(m_graph,v,"annot"); + for (auto it = matches.begin(); it != matches.end(); it++) { if (annotationData->hasAnnotation(*it,Common::Misc::utf8stdstring2limastring("SpecificEntity"))) { const SpecificEntityAnnotation* annot = 0; @@ -262,7 +261,7 @@ vertexStringForSemanticAnnotation(const std::string& vertexRole, LERROR << "This annotation is not a SemanticAnnotation"; continue; } - type=Common::Misc::limastring2utf8stdstring(Common::LinguisticData::LinguisticData::single().getEntityName(annot->getType())); + type=Common::Misc::limastring2utf8stdstring(Common::MediaticData::MediaticData::single().getEntityName(annot->getType())); break; } } From cbb25db1cf7d14180bc7c0de82cc63a17a5ce6d2 Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Thu, 9 Jun 2016 16:29:07 +0200 Subject: [PATCH 64/82] Call embedded srl with language to use --- .../SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp 
b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp index 5986265a1..a4ea18620 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp @@ -355,7 +355,10 @@ LimaStatusCode KnowledgeBasedSemanticRoleLabeler::process( } // Run the semantic role labeller - PyObject* callResult = PyObject_CallMethod(m_d->m_instance, "annotate", "s", conllInput.toUtf8().constData()); + PyObject* callResult = PyObject_CallMethod(m_d->m_instance, "annotate", "ss", + conllInput.toUtf8().constData(), + metadata->getMetaData("Lang").c_str() + ); HANDLE_ERROR_EQUAL(callResult, NULL, failure_during_call_of_the_annotate_method_on(conllInput)); // Display the SRL result From 754b625c6d709afcd3294ab16ff582651c7b652f Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Mon, 13 Jun 2016 22:35:16 +0200 Subject: [PATCH 65/82] Merge --- .../conf/lima-lp-fre.xml | 305 +++++++++++------- .../KnowledgeBasedSemanticRoleLabeler.cpp | 51 +-- 2 files changed, 216 insertions(+), 140 deletions(-) diff --git a/lima_linguisticprocessing/conf/lima-lp-fre.xml b/lima_linguisticprocessing/conf/lima-lp-fre.xml index 543ac5d0f..f5e3f6a52 100644 --- a/lima_linguisticprocessing/conf/lima-lp-fre.xml +++ b/lima_linguisticprocessing/conf/lima-lp-fre.xml @@ -1,9 +1,19 @@ + + + + + + + + + + @@ -28,7 +38,8 @@ - + + @@ -38,7 +49,7 @@ - + @@ -50,7 +61,7 @@ - + @@ -59,11 +70,15 @@ + + + + + - @@ -74,17 +89,8 @@ - - - - - - - - - - + @@ -118,7 +124,7 @@ - + @@ -147,7 +153,7 @@ - + @@ -182,23 +188,23 @@ - - - - - + + + + + - + - + @@ -318,12 +324,12 @@ - + - + @@ -355,7 +361,7 @@ - + @@ -410,10 +416,10 @@ - @@ -426,11 +432,11 @@ - - - - - + + + + + @@ -485,7 +491,16 @@ - @@ -619,18 +634,18 @@ - - - - - - - - - + + + + + + + + + - - + + @@ -646,9 +661,41 @@ - + + + + + + + + + + + + + + + 
+ @@ -656,18 +703,39 @@ - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - + + @@ -677,18 +745,18 @@ - - - - - - - - - + + + + + + + + + - - + + @@ -699,18 +767,18 @@ - - - - - - - - - + + + + + + + + + - - + + @@ -720,18 +788,18 @@ - - - - - - - - - + + + + + + + + + - - + + @@ -741,18 +809,18 @@ - - - - - - - - - + + + + + + + + + - - + + @@ -773,18 +841,18 @@ - - - - - - - - - + + + + + + + + + - - + + @@ -793,10 +861,6 @@ - - - - @@ -908,6 +972,9 @@ + + + diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp index 5986265a1..c21a42cbc 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp @@ -151,28 +151,29 @@ void KnowledgeBasedSemanticRoleLabeler::init( // optional parameter: keep default value (empty) } - try { - m_d->m_inputSuffix=QString::fromUtf8(unitConfiguration.getParamsValueAtKey("inputSuffix").c_str()); - } - catch (Common::XMLConfigurationFiles::NoSuchParam& ) { - LERROR << "Missing 'inputSuffix' parameter in KnowledgeBasedSemanticRoleLabeler group for language " - << (int)language << " !"; - throw InvalidConfiguration(); - } + if (m_d->m_temporaryFileMetadata.isEmpty()) + { + try { + m_d->m_inputSuffix=QString::fromUtf8(unitConfiguration.getParamsValueAtKey("inputSuffix").c_str()); + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + SEMANTICANALYSISLOGINIT; + LERROR << "Missing 'inputSuffix' parameter in KnowledgeBasedSemanticRoleLabeler group for language " + << (int)language << " !"; + throw InvalidConfiguration(); + } - try { - m_d->m_outputSuffix=QString::fromUtf8(unitConfiguration.getParamsValueAtKey("outputSuffix").c_str()); - } - catch 
(Common::XMLConfigurationFiles::NoSuchParam& ) { - LERROR << "Missing 'outputSuffix' parameter in KnowledgeBasedSemanticRoleLabeler group for language " - << (int)language << " !"; - throw InvalidConfiguration(); + try { + m_d->m_outputSuffix=QString::fromUtf8(unitConfiguration.getParamsValueAtKey("outputSuffix").c_str()); + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + SEMANTICANALYSISLOGINIT; + LERROR << "Missing 'outputSuffix' parameter in KnowledgeBasedSemanticRoleLabeler group for language " + << (int)language << " !"; + throw InvalidConfiguration(); + } } - - QString path; - QString mode = "VerbNet"; QString kbsrlLogLevel = "error"; - try { kbsrlLogLevel = QString::fromUtf8(unitConfiguration.getParamsValueAtKey("loglevel").c_str()); @@ -182,6 +183,7 @@ void KnowledgeBasedSemanticRoleLabeler::init( // keep default } + QString path; try { path = QString::fromUtf8(unitConfiguration.getParamsValueAtKey("path").c_str()); @@ -193,6 +195,7 @@ void KnowledgeBasedSemanticRoleLabeler::init( throw InvalidConfiguration(); } + QString mode = "VerbNet"; try { mode = QString::fromUtf8(unitConfiguration.getParamsValueAtKey("mode").c_str()); @@ -238,7 +241,7 @@ void KnowledgeBasedSemanticRoleLabeler::init( // Add the path to the knowledgesrl pachkage to putho path PyObject* pythonpath = PySys_GetObject("path"); - if (PyList_Append(pythonpath, PyUnicode_DecodeFSDefault("/home/gael/Projets/knowledgesrl/src")) == -1) + if (PyList_Append(pythonpath, PyUnicode_DecodeFSDefault(path.toUtf8().constData())) == -1) { LERROR << "Failed to append to python path"; PyErr_Print(); @@ -249,7 +252,8 @@ void KnowledgeBasedSemanticRoleLabeler::init( PyObject* semanticrolelabeler_module = PyImport_ImportModule("semanticrolelabeler"); if (semanticrolelabeler_module == NULL) { - LERROR << "Failed to import srl semanticrolelabeler module"; + SEMANTICANALYSISLOGINIT; + LERROR << "KnowledgeBasedSemanticRoleLabeler::init"<< __FILE__ << __LINE__ << ": Failed to import srl 
semanticrolelabeler module"; PyErr_Print(); Py_Exit(1); } @@ -351,6 +355,11 @@ LimaStatusCode KnowledgeBasedSemanticRoleLabeler::process( return CANNOT_OPEN_FILE_ERROR; } conllInput = QString::fromUtf8(temporaryFile->readAll().constData()); +#ifdef DEBUG_LP + temporaryFile->setAutoRemove(false); + SEMANTICANALYSISLOGINIT; + LDEBUG << "KnowledgeBasedSemanticRoleLabeler: keeping temporary file after dumping CoNLL data to it for debugging"<< temporaryFile->fileName(); +#endif temporaryFile->close(); } From b98bf2014821c31becc593e18d2483df7d77397c Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Mon, 13 Jun 2016 22:37:14 +0200 Subject: [PATCH 66/82] Merge --- lima_linguisticprocessing/conf/lima-lp-fre.xml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lima_linguisticprocessing/conf/lima-lp-fre.xml b/lima_linguisticprocessing/conf/lima-lp-fre.xml index f5e3f6a52..3c700979b 100644 --- a/lima_linguisticprocessing/conf/lima-lp-fre.xml +++ b/lima_linguisticprocessing/conf/lima-lp-fre.xml @@ -136,11 +136,13 @@ + - + - + + From 2ddeb960f6aa306c1939338311293e68af9c4065 Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Thu, 30 Jun 2016 11:45:51 +0200 Subject: [PATCH 67/82] Transmit language to embeded python srl --- .../SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp index c21a42cbc..6013a9f85 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp @@ -259,9 +259,10 @@ void KnowledgeBasedSemanticRoleLabeler::init( } // Create the semantic role labeller instance - 
m_d->m_instance = PyObject_CallMethod(semanticrolelabeler_module, "SemanticRoleLabeler", "[ss]", + m_d->m_instance = PyObject_CallMethod(semanticrolelabeler_module, "SemanticRoleLabeler", "[sss]", QString("--log=%1").arg(kbsrlLogLevel).toUtf8().constData(), - QString("--frame-lexicon=%1").arg(mode).toUtf8().constData()); + QString("--frame-lexicon=%1").arg(mode).toUtf8().constData(), + QString("--language=%1").arg(Lima::Common::MediaticData::MediaticData::single().getMediaId(language).c_str()).toUtf8().constData()); HANDLE_ERROR_EQUAL(m_d->m_instance,NULL,cannot_instantiate_the_semanticrolelabeler_python_class()) } From 504a4283aa2ac6327a75bfb1c83fff0442d956f3 Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Tue, 26 Jul 2016 14:42:02 +0200 Subject: [PATCH 68/82] Correct micro category in Modex --- lima_linguisticdata/SpecificEntities/conf/Numex-modex.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lima_linguisticdata/SpecificEntities/conf/Numex-modex.xml b/lima_linguisticdata/SpecificEntities/conf/Numex-modex.xml index 289adff19..df31a51b0 100644 --- a/lima_linguisticdata/SpecificEntities/conf/Numex-modex.xml +++ b/lima_linguisticdata/SpecificEntities/conf/Numex-modex.xml @@ -124,7 +124,7 @@ - + From f80548b1c82a7f19e9f057f3c2cc17f5162e4b76 Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Thu, 1 Oct 2015 16:56:21 +0200 Subject: [PATCH 69/82] Correct tokenization of fre ordinal integers --- .../LinguisticProcessings/fre/tokenizerAutomaton-fre.tok | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lima_linguisticdata/scratch/LinguisticProcessings/fre/tokenizerAutomaton-fre.tok b/lima_linguisticdata/scratch/LinguisticProcessings/fre/tokenizerAutomaton-fre.tok index 06b98cd19..4419f3f6c 100644 --- a/lima_linguisticdata/scratch/LinguisticProcessings/fre/tokenizerAutomaton-fre.tok +++ b/lima_linguisticdata/scratch/LinguisticProcessings/fre/tokenizerAutomaton-fre.tok @@ -326,6 +326,12 @@ - c_all > START - m_eof = END } 
+(ORDINAL_INTEGER2) { + - c_all > ORDINAL_INTEGER1 +} +(ORDINAL_INTEGER1) { + - c_all > ORDINAL_INTEGER +} (ORDINAL_INTEGER) { - c_hyphen c_a_t c_hyphen = TEUPHOT (T_ALPHA) - c_hyphen c_a_t c_quote = TEUPHOT (T_ALPHA) From e8cd69f344834034565d998726070657016ad927 Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Mon, 11 Apr 2016 13:58:15 +0200 Subject: [PATCH 70/82] Correct lemma+norm of unknown tokens with digits --- .../fre/tokenizerAutomaton-fre.chars.tok | 20 +- .../conf/lima-lp-eng.xml | 3 +- .../conf/lima-lp-fre.xml | 3 +- .../conf/lima-lp-tva-eng.xml | 3 +- .../conf/lima-lp-tva-fre.xml | 3 +- .../data/test-fre.default.xml | 12 +- .../data/test-fre.simpleword.xml | 400 +++++++++--------- 7 files changed, 233 insertions(+), 211 deletions(-) diff --git a/lima_linguisticdata/scratch/LinguisticProcessings/fre/tokenizerAutomaton-fre.chars.tok b/lima_linguisticdata/scratch/LinguisticProcessings/fre/tokenizerAutomaton-fre.chars.tok index 7498a74fe..c223c911c 100644 --- a/lima_linguisticdata/scratch/LinguisticProcessings/fre/tokenizerAutomaton-fre.chars.tok +++ b/lima_linguisticdata/scratch/LinguisticProcessings/fre/tokenizerAutomaton-fre.chars.tok @@ -63,16 +63,16 @@ chars { 002D, HYPHEN-MINUS, m_pattern ; 002E, FULL STOP, c_dot ; 002F, SOLIDUS, c_slash ; -0030, DIGIT ZERO, c_5 ; -0031, DIGIT ONE, c_5 ; -0032, DIGIT TWO, c_5 ; -0033, DIGIT THREE, c_5 ; -0034, DIGIT FOUR, c_5 ; -0035, DIGIT FIVE, c_5 ; -0036, DIGIT SIX, c_5 ; -0037, DIGIT SEVEN, c_5 ; -0038, DIGIT EIGHT, c_5 ; -0039, DIGIT NINE, c_5 ; +0030, DIGIT ZERO, c_5, u0030 ; +0031, DIGIT ONE, c_5, u0031 ; +0032, DIGIT TWO, c_5, u0032 ; +0033, DIGIT THREE, c_5, u0033 ; +0034, DIGIT FOUR, c_5, u0034 ; +0035, DIGIT FIVE, c_5, u0035 ; +0036, DIGIT SIX, c_5, u0036 ; +0037, DIGIT SEVEN, c_5, u0037 ; +0038, DIGIT EIGHT, c_5, u0038 ; +0039, DIGIT NINE, c_5, u0039 ; 003A, COLON, c_del2 ; 003B, SEMICOLON, c_del2 ; 003C, LESS-THAN SIGN, c_del1 ; diff --git a/lima_linguisticprocessing/conf/lima-lp-eng.xml 
b/lima_linguisticprocessing/conf/lima-lp-eng.xml index f9a468617..1010e17c1 100644 --- a/lima_linguisticprocessing/conf/lima-lp-eng.xml +++ b/lima_linguisticprocessing/conf/lima-lp-eng.xml @@ -406,8 +406,9 @@ - + + diff --git a/lima_linguisticprocessing/conf/lima-lp-fre.xml b/lima_linguisticprocessing/conf/lima-lp-fre.xml index 3c700979b..f47230360 100644 --- a/lima_linguisticprocessing/conf/lima-lp-fre.xml +++ b/lima_linguisticprocessing/conf/lima-lp-fre.xml @@ -357,8 +357,9 @@ - + + diff --git a/lima_linguisticprocessing/conf/lima-lp-tva-eng.xml b/lima_linguisticprocessing/conf/lima-lp-tva-eng.xml index d68c430a7..d6d40a984 100644 --- a/lima_linguisticprocessing/conf/lima-lp-tva-eng.xml +++ b/lima_linguisticprocessing/conf/lima-lp-tva-eng.xml @@ -146,8 +146,9 @@ - + + diff --git a/lima_linguisticprocessing/conf/lima-lp-tva-fre.xml b/lima_linguisticprocessing/conf/lima-lp-tva-fre.xml index fe33a8266..c4a238758 100644 --- a/lima_linguisticprocessing/conf/lima-lp-tva-fre.xml +++ b/lima_linguisticprocessing/conf/lima-lp-tva-fre.xml @@ -143,8 +143,9 @@ - + + diff --git a/lima_linguisticprocessing/data/test-fre.default.xml b/lima_linguisticprocessing/data/test-fre.default.xml index 882f2d1ac..e2fa83a61 100644 --- a/lima_linguisticprocessing/data/test-fre.default.xml +++ b/lima_linguisticprocessing/data/test-fre.default.xml @@ -20,6 +20,16 @@ left="XPATH#//data_structure/vertex[token/position=4]/data/unknown_word//p[@prop='MACRO']/@val" operator="contains" right="NC"/> + + @@ -87,6 +97,7 @@ right="euritrack"/> + EURITRACK est un mot inconnu, doit être normalisé 'euritrack' @@ -94,7 +105,6 @@ - EURITRACK est un mot inconnu, doit être normalisé 'euritrack' test recherche de mot dans le dico : vérification présence nappes, fn : nappe. - + @@ -37,183 +41,187 @@ test recherche de mot dans le dico : vérification présence nappes, fn : napper. 
- + + + + + + + + + + + eleve + + + + + + + + + + + + éleve + + + + + + + + + + + + elève + + + + + + + + + + + + éléve + + + + + + + + + + + + Frère + + + + + + + + + + + Frere + + + + + + + + + + + Amedée + + + + + + + + + + + marche + + + + + + + + + + + evenement + + + + + + + + + + + createur + + - - - - - - - - - eleve - - - - - - - - - - - - éleve - - - - - - - - - - - - elève - - - - - - - - - - - - éléve - - - - - - - - - - - - Frère - - - - - - - - - - - Frere - - - - - - - - - - - Amedée - - - - - - - - - - - marche - - - - - - - - - - - evenement - - - - - - - - - - - createur - - - @@ -329,23 +337,23 @@ operator="=" right="à-propos"/> - - - - - - - - - - - + + + + + + + + + + + From 7e84e5a8868c8ffbd778f33a567a2244a9ebe63f Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Wed, 27 Jul 2016 10:59:55 +0200 Subject: [PATCH 71/82] Use inflected form as lemma if it is empty Solves ANT'inno issue 30. --- .../BagOfWords/bowBinaryReaderWriter.cpp | 24 +++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowBinaryReaderWriter.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowBinaryReaderWriter.cpp index 13f87a833..79138d7bd 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowBinaryReaderWriter.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowBinaryReaderWriter.cpp @@ -341,6 +341,13 @@ void BoWBinaryReaderPrivate::readSimpleToken(std::istream& file, #ifdef DEBUG_LP LDEBUG << "BoWBinaryReader::readSimpleToken read infl: " << inflectedForm; #endif + if (lemma.isEmpty()) + { +#ifdef DEBUG_LP + LDEBUG << "BoWBinaryWriter::readSimpleToken empty lemma, using inflected form instead:" << inflectedForm; +#endif + lemma = inflectedForm; + } LinguisticCode category; uint64_t position,length; category=static_cast(Misc::readCodedInt(file)); @@ -604,9 +611,22 @@ void 
BoWBinaryWriterPrivate::writeSimpleToken(std::ostream& file, { #ifdef DEBUG_LP BOWLOGINIT; - LDEBUG << "BoWBinaryWriter::writeSimpleToken write lemma: " << &file << token->getLemma(); + LDEBUG << "BoWBinaryWriter::writeSimpleToken write lemma:" << &file << token->getLemma(); +#endif + if (!token->getLemma().isEmpty()) + { +#ifdef DEBUG_LP + LDEBUG << "BoWBinaryWriter::writeSimpleToken non-empty lemma"; +#endif + Misc::writeUTF8StringField(file,token->getLemma()); + } + else + { +#ifdef DEBUG_LP + LDEBUG << "BoWBinaryWriter::writeSimpleToken empty lemma, writing inflected form instead:" << token->getInflectedForm(); #endif - Misc::writeUTF8StringField(file,token->getLemma()); + Misc::writeUTF8StringField(file,token->getInflectedForm()); + } #ifdef DEBUG_LP LDEBUG << "BoWBinaryWriter::writeSimpleToken write infl: " << token->getInflectedForm(); #endif From 9c2351018c54df08d9c9445da38dfc37462a4f60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Romaric=20Besan=C3=A7on?= Date: Mon, 27 Jun 2016 15:36:31 +0200 Subject: [PATCH 72/82] corrected generation of bow named entity: no not generate a token duplicate to the NE. 
--- .../core/AnalysisDumpers/BowGeneration.cpp | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowGeneration.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowGeneration.cpp index 49b24980d..c67c306f8 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowGeneration.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowGeneration.cpp @@ -233,6 +233,7 @@ std::vector< std::pair< boost::shared_ptr< BoWRelation >, boost::shared_ptr< BoW const AnnotationData* annotationData, std::set< LinguisticGraphVertex >& visited) const { + #ifdef DEBUG_LP DUMPERLOGINIT; LDEBUG << "BowGenerator::buildTermFor annot:" << vx << "; pointing on annot:"<, boost::shared_ptr< Abs LDEBUG << "BowGenerator::createAbstractBoWElement " << v << " has " << anaVertices.size() << " matching vertices in analysis graph"; #endif + bool createdSpecificEntity(false); + // note: anaVertices size should be 0 or 1 for ( AnnotationGraphVertex anaVertex : anaVertices) { @@ -611,6 +614,7 @@ std::vector< std::pair< boost::shared_ptr< BoWRelation >, boost::shared_ptr< Abs se->setVertex(v); abstractBowEl.push_back(std::make_pair(boost::shared_ptr< BoWRelation >(),se)); // visited.insert(v); + createdSpecificEntity=true; break; } } @@ -717,7 +721,14 @@ std::vector< std::pair< boost::shared_ptr< BoWRelation >, boost::shared_ptr< Abs std::set > alreadyCreated; std::pair predNormCode = std::make_pair(StringsPoolIndex(0),LinguisticCode(0)); - + + if (createdSpecificEntity) { + // a specific entity has been created on the analysis graph: do not output a token + // (RB: do that here so that the vertex on the posgraph can also be analyzed: should test is this is + // needed or if we only need to place the return just after the creation of the named entity) + return abstractBowEl; + } + if (data!=0) { for (auto 
it=data->begin(); it!=data->end(); it++) From 41ed630776900cac045912bbd8a2baa2787b2efc Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Wed, 27 Jul 2016 11:31:53 +0200 Subject: [PATCH 73/82] Avoid setting an empty string as lemma. --- .../core/MorphologicAnalysis/DefaultProperties.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/DefaultProperties.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/DefaultProperties.cpp index 5619d3e28..56aab114a 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/DefaultProperties.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/DefaultProperties.cpp @@ -153,10 +153,7 @@ LimaStatusCode DefaultProperties::process( } else if(m_skipUnmarkStatus.find(currentToken->status().defaultKey())==m_skipUnmarkStatus.end()) { - LimaString str; -// elem.lemma= Common::MediaticData::MediaticData::changeable().stringsPool(m_language)[currentToken->stringForm()]; -// LimaString str = m_charChart->toLower(currentToken->stringForm()); - elem.lemma= Common::MediaticData::MediaticData::changeable().stringsPool(m_language)[str]; + elem.lemma= Common::MediaticData::MediaticData::changeable().stringsPool(m_language)[currentToken->stringForm()]; } elem.normalizedForm=elem.lemma; elem.type=UNKNOWN_WORD; From 99f3653d4c83d020a0b9cfc320e90d91b242b16a Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Wed, 27 Jul 2016 14:08:29 +0200 Subject: [PATCH 74/82] Use enum class for BoWType --- .../common/BagOfWords/AbstractBoWElement.h | 24 ++++++++++++++++--- .../common/BagOfWords/BoWPredicate.h | 2 +- .../BagOfWords/bowBinaryReaderWriter.cpp | 18 +++++++------- .../common/BagOfWords/bowNamedEntity.h | 2 +- .../common/BagOfWords/bowTerm.h | 2 +- .../common/BagOfWords/bowToken.cpp | 15 ++++++++++-- .../common/BagOfWords/bowTokenIterator.cpp | 16 ++++++------- 
.../common/BagOfWords/bowXMLWriter.cpp | 16 ++++++------- .../common/BagOfWords/indexElement.cpp | 10 ++++---- .../BagOfWords/indexElementIterator.cpp | 20 ++++++++-------- .../BagOfWords/tests/BagOfWordsTest2.cpp | 20 ++++++++-------- .../core/AnalysisDumpers/GenericXmlDumper.cpp | 12 +++++----- .../tools/common/getLexiconFromBoW.cpp | 6 ++--- .../tools/normalize/normalizeTerm.cpp | 4 ++-- 14 files changed, 98 insertions(+), 69 deletions(-) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/AbstractBoWElement.h b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/AbstractBoWElement.h index c441840d5..e24388193 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/AbstractBoWElement.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/AbstractBoWElement.h @@ -47,7 +47,11 @@ namespace BagOfWords { /** * enum to characterize the type of the AbstractBoWElement */ -typedef enum { +#ifndef WIN32 +enum class BoWType : unsigned short { +#else +public enum class BoWType : unsigned short { +#endif BOW_NOTYPE, /**< the AbstractBoWElement is an abstract one that should not be instanciated */ BOW_TOKEN, /**< the AbstractBoWElement is a simple token */ @@ -55,7 +59,21 @@ typedef enum { BOW_NAMEDENTITY, /**< the AbstractBoWElement is a named entity */ BOW_PREDICATE, /**< the AbstractBoWElement is a predicate (n-ary relation, template or semantic frame */ -} BoWType; +}; + +uint8_t toInt(const BoWType& bt); + +template +T& operator<<(T& qd, const BoWType& bt) +{ + if (bt == BoWType::BOW_NOTYPE) qd << "BOW_NOTYPE"; + else if (bt == BoWType::BOW_TOKEN) qd << "BOW_TOKEN"; + else if (bt == BoWType::BOW_TERM) qd << "BOW_TERM"; + else if (bt == BoWType::BOW_NAMEDENTITY) qd << "BOW_NAMEDENTITY"; + else if (bt == BoWType::BOW_PREDICATE) qd << "BOW_PREDICATE"; + else qd << "UNDEFINED"; + return qd; +}; /** * This class is the abstract base class of all elements that can be stored in @@ 
-79,7 +97,7 @@ class LIMA_BOW_EXPORT AbstractBoWElement * a predicate, 1 for a simple token, n for complex tokens */ virtual uint64_t size(void) const = 0; - virtual BoWType getType() const {return BOW_NOTYPE;} + virtual BoWType getType() const {return BoWType::BOW_NOTYPE;} virtual Lima::LimaString getString(void) const = 0; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/BoWPredicate.h b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/BoWPredicate.h index b1f1cae9a..3a5d3bd31 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/BoWPredicate.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/BoWPredicate.h @@ -65,7 +65,7 @@ class LIMA_BOW_EXPORT BoWPredicate : public AbstractBoWElement MediaticData::EntityType getPredicateType(void) const; void setPredicateType(const MediaticData::EntityType&); - virtual BoWType getType() const { return BOW_PREDICATE; } + virtual BoWType getType() const { return BoWType::BOW_PREDICATE; } virtual Lima::LimaString getString(void) const; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowBinaryReaderWriter.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowBinaryReaderWriter.cpp index 79138d7bd..d71259873 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowBinaryReaderWriter.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowBinaryReaderWriter.cpp @@ -283,19 +283,19 @@ boost::shared_ptr< AbstractBoWElement > BoWBinaryReaderPrivate::readBoWToken( st #endif boost::shared_ptr< AbstractBoWElement > token; switch (type) { - case BOW_TOKEN: { + case BoWType::BOW_TOKEN: { token=boost::shared_ptr< BoWToken >( new BoWToken); readSimpleToken(file, boost::dynamic_pointer_cast(token)); break; } - case BOW_TERM: { + case BoWType::BOW_TERM: { token=boost::shared_ptr< BoWTerm >(new BoWTerm); // LDEBUG << "BoWToken: 
calling read(file) on term"; readSimpleToken(file,boost::dynamic_pointer_cast(token)); readComplexTokenParts(file,boost::dynamic_pointer_cast(token)); break; } - case BOW_NAMEDENTITY: { + case BoWType::BOW_NAMEDENTITY: { token=boost::shared_ptr< BoWNamedEntity >(new BoWNamedEntity); // LDEBUG << "BoWToken: calling read(file) on NE"; readSimpleToken(file,boost::dynamic_pointer_cast(token)); @@ -303,7 +303,7 @@ boost::shared_ptr< AbstractBoWElement > BoWBinaryReaderPrivate::readBoWToken( st readNamedEntityProperties(file,boost::dynamic_pointer_cast(token)); break; } - case BOW_PREDICATE:{ + case BoWType::BOW_PREDICATE:{ token=boost::shared_ptr< BoWPredicate >(new BoWPredicate); readPredicate(file,boost::dynamic_pointer_cast(token)); break; @@ -570,25 +570,25 @@ void BoWBinaryWriterPrivate::writeBoWToken( std::ostream& file, const boost::sha BOWLOGINIT; LDEBUG << "BoWBinaryWriter::writeBoWToken token type is " << token->getType() << &file; #endif - Misc::writeOneByteInt(file,token->getType()); + Misc::writeOneByteInt(file,toInt(token->getType())); switch (token->getType()) { - case BOW_TOKEN: { + case BoWType::BOW_TOKEN: { writeSimpleToken(file,boost::dynamic_pointer_cast(token)); break; } - case BOW_TERM: { + case BoWType::BOW_TERM: { writeSimpleToken(file,boost::dynamic_pointer_cast(token)); writeComplexTokenParts(file,boost::dynamic_pointer_cast(token)); break; } - case BOW_NAMEDENTITY: { + case BoWType::BOW_NAMEDENTITY: { boost::shared_ptr< BoWNamedEntity > ne=boost::dynamic_pointer_cast(token); writeSimpleToken(file,boost::dynamic_pointer_cast(token)); writeComplexTokenParts(file,ne); writeNamedEntityProperties(file,ne); break; } - case BOW_PREDICATE:{ + case BoWType::BOW_PREDICATE:{ writePredicate(file,boost::dynamic_pointer_cast(token)); break; } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowNamedEntity.h b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowNamedEntity.h index 60e90dcc9..6c0304384 
100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowNamedEntity.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowNamedEntity.h @@ -76,7 +76,7 @@ class LIMA_BOW_EXPORT BoWNamedEntity : public BoWComplexToken void setFeature(const std::string& attribute, const LimaString& value); - virtual BoWType getType() const { return BOW_NAMEDENTITY; } + virtual BoWType getType() const { return BoWType::BOW_NAMEDENTITY; } /** * get a string of the features, of the kind : diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowTerm.h b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowTerm.h index 436dc32f9..9bb94c7f9 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowTerm.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowTerm.h @@ -65,7 +65,7 @@ class LIMA_BOW_EXPORT BoWTerm : public BoWComplexToken BoWTerm& operator=(const BoWTerm&); - virtual BoWType getType() const { return BOW_TERM; } + virtual BoWType getType() const { return BoWType::BOW_TERM; } /** get a string of the BoWToken for output function */ virtual std::string getOutputUTF8String(const Common::PropertyCode::PropertyManager* macroManager = 0) const; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowToken.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowToken.cpp index 7fbe39a9c..dc031c41f 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowToken.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowToken.cpp @@ -52,6 +52,17 @@ namespace Common namespace BagOfWords { +uint8_t toInt(const BoWType& bt) +{ + if (bt == BoWType::BOW_NOTYPE) return 0; + else if (bt == BoWType::BOW_TOKEN) return 1; + else if (bt == BoWType::BOW_TERM) return 2; + else if (bt == BoWType::BOW_NAMEDENTITY) return 3; + else if (bt == 
BoWType::BOW_PREDICATE) return 4; + else return std::numeric_limits::max(); +} + + #define DEFAULT_SEPARATOR L'#' #define DEFAULT_COMPOUND_SEPARATOR L'_' @@ -287,7 +298,7 @@ void BoWToken::setCategory(LinguisticCode c) {m_d->m_category = c;}; void BoWToken::setPosition(const uint64_t pos){m_d->m_position = pos;}; void BoWToken::setLength(const uint64_t len) {m_d->m_length = len;}; -BoWType BoWToken::getType() const { return BOW_TOKEN; } +BoWType BoWToken::getType() const { return BoWType::BOW_TOKEN; } uint64_t BoWToken::getVertex() const {return m_d->m_vertex;} void BoWToken::setVertex(uint64_t vertex) {m_d->m_vertex = vertex;} @@ -384,7 +395,7 @@ Common::Misc::PositionLengthList BoWToken::getPositionLengthList() const bool BoWToken::operator==(const BoWToken& t) const { - if ((getType()==BOW_NAMEDENTITY) && (t.getType()==BOW_NAMEDENTITY)) + if ((getType()==BoWType::BOW_NAMEDENTITY) && (t.getType()==BoWType::BOW_NAMEDENTITY)) { const BoWNamedEntity* n1=dynamic_cast(this); const BoWNamedEntity* n2=dynamic_cast(&t); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowTokenIterator.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowTokenIterator.cpp index 061c7e08a..83dc3534c 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowTokenIterator.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowTokenIterator.cpp @@ -191,13 +191,13 @@ boost::shared_ptr< AbstractBoWElement > BoWTokenIterator::getElement() { } else { switch ((*m_d->m_iterator)->getType()) { - case BOW_PREDICATE: - case BOW_TOKEN: { + case BoWType::BOW_PREDICATE: + case BoWType::BOW_TOKEN: { return *m_d->m_iterator; break; } - case BOW_TERM: - case BOW_NAMEDENTITY: { + case BoWType::BOW_TERM: + case BoWType::BOW_NAMEDENTITY: { // element itself will be stored in queue as part m_d->storePartsInQueue(boost::dynamic_pointer_cast< BoWToken >(*m_d->m_iterator)); return 
m_d->m_partQueue.front().getBoWToken(); @@ -282,7 +282,7 @@ bool BoWTokenIteratorPrivate::addPartElementsInQueue(boost::shared_ptr< BoWToken return false; } // addInPartQueue(token,false); - if (token->getType()==BOW_NAMEDENTITY + if (token->getType()==BoWType::BOW_NAMEDENTITY && m_iterateThroughNamedEntitiesParts==DO_NOT_ITERATE_THROUGH_NAMEDENTITIES_PARTS) { PartTokens pt; @@ -292,14 +292,14 @@ bool BoWTokenIteratorPrivate::addPartElementsInQueue(boost::shared_ptr< BoWToken } switch (token->getType()) { - case BOW_TOKEN: { + case BoWType::BOW_TOKEN: { // push simple token in parts partTokens.push_back(PartTokens()); partTokens.back().push_back(token); break; } - case BOW_TERM: - case BOW_NAMEDENTITY: { + case BoWType::BOW_TERM: + case BoWType::BOW_NAMEDENTITY: { boost::shared_ptr< BoWComplexToken > complexToken=boost::dynamic_pointer_cast(token); if (complexToken->size() == 1) { diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowXMLWriter.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowXMLWriter.cpp index 2173caea0..bcc5ee0c8 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowXMLWriter.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowXMLWriter.cpp @@ -359,10 +359,10 @@ void BoWXMLWriterPrivate::writeIndexElement( << "\" length=\"" << element.getLength() << "\""; if (element.isNamedEntity()) { m_outputStream << " neType=\"" << element.getNamedEntityType() << "\""; - m_outputStream << " type=\"" << BOW_NAMEDENTITY << "\""; + m_outputStream << " type=\"" << BoWType::BOW_NAMEDENTITY << "\""; } else { - m_outputStream << " type=\"" << BOW_TOKEN << "\""; + m_outputStream << " type=\"" << BoWType::BOW_TOKEN << "\""; } m_outputStream << "/>" << endl; return; @@ -371,10 +371,10 @@ void BoWXMLWriterPrivate::writeIndexElement( // compound if (element.isNamedEntity()) { m_outputStream << " neType=\"" << element.getNamedEntityType() << "\""; - 
m_outputStream << " type=\"" << BOW_NAMEDENTITY << "\""; + m_outputStream << " type=\"" << BoWType::BOW_NAMEDENTITY << "\""; } else { - m_outputStream << " type=\"" << BOW_TERM << "\""; + m_outputStream << " type=\"" << BoWType::BOW_TERM << "\""; } m_outputStream << ">" << endl << m_spaces << " " << endl; @@ -403,7 +403,7 @@ void BoWXMLWriterPrivate::writeBoWToken( m_currentTokId++; const BoWToken* tok = 0; switch(token->getType()) { - case BOW_TOKEN: { + case BoWType::BOW_TOKEN: { tok = static_cast(token); std::string cat = static_cast(Common::MediaticData::MediaticData::single().mediaData(m_language)).getPropertyCodeManager().getPropertyManager("MACRO").getPropertySymbolicValue(static_cast(tok->getCategory())); @@ -416,7 +416,7 @@ void BoWXMLWriterPrivate::writeBoWToken( << "/>" << std::endl; break; } - case BOW_PREDICATE: { + case BoWType::BOW_PREDICATE: { const BoWPredicate* term=static_cast(token); m_outputStream <" << std::endl; break; } - case BOW_TERM: { + case BoWType::BOW_TERM: { const BoWTerm* term=static_cast(token); std::string cat = static_cast(Common::MediaticData::MediaticData::single().mediaData(m_language)).getPropertyCodeManager().getPropertyManager("MACRO").getPropertySymbolicValue(static_cast(term->getCategory())); @@ -447,7 +447,7 @@ void BoWXMLWriterPrivate::writeBoWToken( m_outputStream <" << std::endl; break; } - case BOW_NAMEDENTITY: { + case BoWType::BOW_NAMEDENTITY: { const BoWNamedEntity* ne=static_cast(token); std::string cat = static_cast(Common::MediaticData::MediaticData::single().mediaData(m_language)).getPropertyCodeManager().getPropertyManager("MACRO").getPropertySymbolicValue(static_cast(ne->getCategory())); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/indexElement.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/indexElement.cpp index 8cf7f87cc..abec11f2a 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/indexElement.cpp +++ 
b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/indexElement.cpp @@ -82,7 +82,7 @@ class IndexElementPrivate IndexElementPrivate::IndexElementPrivate(): m_id(0), -m_type(BOW_NOTYPE), +m_type(BoWType::BOW_NOTYPE), m_word(), m_category(0), m_position(0), @@ -238,11 +238,11 @@ uint64_t IndexElement::getId() const { return m_d->m_id; } Lima::Common::BagOfWords::BoWType IndexElement::getType() const { return m_d->m_type; } -bool IndexElement::isSimpleTerm() const { return m_d->m_type == BOW_TOKEN; } +bool IndexElement::isSimpleTerm() const { return m_d->m_type == BoWType::BOW_TOKEN; } -bool IndexElement::isComposedTerm() const { return m_d->m_type == BOW_TERM; } +bool IndexElement::isComposedTerm() const { return m_d->m_type == BoWType::BOW_TERM; } -bool IndexElement::isPredicate() const { return m_d->m_type == BOW_PREDICATE; } +bool IndexElement::isPredicate() const { return m_d->m_type == BoWType::BOW_PREDICATE; } const LimaString& IndexElement::getSimpleTerm() const { return m_d->m_word; } @@ -252,7 +252,7 @@ uint64_t IndexElement::getPosition() const { return m_d->m_position; } uint64_t IndexElement::getLength() const { return m_d->m_length; } -bool IndexElement::isNamedEntity() const { return m_d->m_type == BOW_NAMEDENTITY; } +bool IndexElement::isNamedEntity() const { return m_d->m_type == BoWType::BOW_NAMEDENTITY; } const Common::MediaticData::EntityType& IndexElement::getNamedEntityType() const { return m_d->m_neType; } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/indexElementIterator.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/indexElementIterator.cpp index 9597eddfb..c1100429d 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/indexElementIterator.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/indexElementIterator.cpp @@ -208,7 +208,7 @@ IndexElement IndexElementIterator::getElement() switch 
((*m_d->m_iterator)->getType()) { - case BOW_TOKEN: + case BoWType::BOW_TOKEN: { #ifdef DEBUG_CD LDEBUG << "IndexElementIterator::getElement simple token:" << token->getIdUTF8String(); @@ -226,7 +226,7 @@ IndexElement IndexElementIterator::getElement() } return m_d->m_alreadyFoundElements[QString::fromUtf8(token->getIdUTF8String().c_str())]; } - case BOW_TERM: + case BoWType::BOW_TERM: #ifdef DEBUG_CD LDEBUG << "IndexElementIterator::getElement term:" << token->getIdUTF8String(); #endif @@ -244,7 +244,7 @@ IndexElement IndexElementIterator::getElement() m_d->m_alreadyFoundElements.insert(QString::fromUtf8(token->getIdUTF8String().c_str()),m_d->m_partQueue.front()); return m_d->m_partQueue.front(); - case BOW_NAMEDENTITY: + case BoWType::BOW_NAMEDENTITY: #ifdef DEBUG_CD LDEBUG << "IndexElementIterator::getElement named entity:" << boost::dynamic_pointer_cast(*m_d->m_iterator)->getIdUTF8String() ;//<< Lima::Common::MediaticData::MediaticData::single().getEntityName(static_cast((*m_d->m_iterator))->getNamedEntityType()); // element itself will be stored in queue as part @@ -257,7 +257,7 @@ IndexElement IndexElementIterator::getElement() return m_d->m_partQueue.front(); // FIXME Change the handling of predicates to take into account their complex structure nature - case BOW_PREDICATE: + case BoWType::BOW_PREDICATE: { predicate = boost::dynamic_pointer_cast((*m_d->m_iterator)); uint64_t id=m_d->m_idGenerator->getId(predicate->getString()); @@ -270,7 +270,7 @@ IndexElement IndexElementIterator::getElement() predicate->getPredicateType() ); } - case BOW_NOTYPE: + case BoWType::BOW_NOTYPE: return IndexElement(); } } @@ -424,7 +424,7 @@ bool IndexElementIteratorPrivate::addPartElementsInQueue(boost::shared_ptr< BoWT bool result = false; switch (token->getType()) { - case BOW_TOKEN: + case BoWType::BOW_TOKEN: { #ifdef DEBUG_CD LDEBUG << "IndexElementIteratorPrivate::addPartElementsInQueue simple token:" << token->getIdUTF8String(); @@ -452,12 +452,12 @@ bool 
IndexElementIteratorPrivate::addPartElementsInQueue(boost::shared_ptr< BoWT ids_rel=make_pair(vector(1,m_alreadyFoundElements[QString::fromUtf8(token->getIdUTF8String().c_str())].getId()),rel); return result; } - case BOW_NAMEDENTITY: + case BoWType::BOW_NAMEDENTITY: neType=boost::dynamic_pointer_cast(token)->getNamedEntityType(); break; - case BOW_TERM: - case BOW_PREDICATE: - case BOW_NOTYPE: + case BoWType::BOW_TERM: + case BoWType::BOW_PREDICATE: + case BoWType::BOW_NOTYPE: default:; } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/tests/BagOfWordsTest2.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/tests/BagOfWordsTest2.cpp index c3481c5b6..6ea8c1c95 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/tests/BagOfWordsTest2.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/tests/BagOfWordsTest2.cpp @@ -34,7 +34,7 @@ void BagOfWordsTest2::test_indexElementDefaultConstructor() // IndexElement(); IndexElement el; QVERIFY(el.getId() == 0); - QVERIFY(el.getType() == BOW_NOTYPE); + QVERIFY(el.getType() == BoWType::BOW_NOTYPE); QVERIFY(el.getSimpleTerm() == ""); QVERIFY(el.getCategory() == 0); QVERIFY(el.getPosition() == 0); @@ -60,7 +60,7 @@ void BagOfWordsTest2::test_indexElementConstructor1() // const Common::MediaticData::EntityType neType=Common::MediaticData::EntityType(), // const uint64_t reType=0); uint64_t id = 1; - BoWType type = BOW_TOKEN; + BoWType type = BoWType::BOW_TOKEN; QString word = QString::fromUtf8("word"); uint64_t cat = 0; uint64_t position = 0; @@ -69,7 +69,7 @@ void BagOfWordsTest2::test_indexElementConstructor1() IndexElement el(id,type,word,cat,position,length,neType); QVERIFY(el.getId() == id); - QVERIFY(el.getType() == BOW_TOKEN); + QVERIFY(el.getType() == BoWType::BOW_TOKEN); QVERIFY(el.getSimpleTerm() == word); QVERIFY(el.getCategory() == cat); QVERIFY(el.getPosition() == position); @@ -93,14 +93,14 @@ void 
BagOfWordsTest2::test_indexElementConstructor2() // const Common::MediaticData::EntityType neType=Common::MediaticData::EntityType(), // const uint64_t reType=0); uint64_t id = 2; - BoWType type = BOW_TERM; + BoWType type = BoWType::BOW_TERM; std::vector structure; std::vector relations; EntityType neType = EntityType(); IndexElement el(id,type,structure,relations,neType); QVERIFY(el.getId() == id); - QVERIFY(el.getType() == BOW_TERM); + QVERIFY(el.getType() == BoWType::BOW_TERM); QVERIFY(el.getSimpleTerm().isEmpty()); QVERIFY(el.getCategory() == 0); QVERIFY(el.getPosition() == 0); @@ -119,7 +119,7 @@ void BagOfWordsTest2::test_indexElementCopyConstructor() qDebug() << "BagOfWordsTest2::test_indexElementCopyConstructor"; // IndexElement(const IndexElement& ie); uint64_t id = 1; - BoWType type = BOW_TOKEN; + BoWType type = BoWType::BOW_TOKEN; QString word = QString::fromUtf8("word"); uint64_t cat = 0; uint64_t position = 0; @@ -147,7 +147,7 @@ void BagOfWordsTest2::test_indexElementCopyConstructor() delete el; el = 0; // Test members after deleting original objects QVERIFY(el_copy.getId() == 1); - QVERIFY(el_copy.getType() == BOW_TOKEN); + QVERIFY(el_copy.getType() == BoWType::BOW_TOKEN); QVERIFY(el_copy.getSimpleTerm() == "word"); QVERIFY(el_copy.getCategory() == 0); QVERIFY(el_copy.getPosition() == 0); @@ -166,7 +166,7 @@ void BagOfWordsTest2::test_indexElementOperatorAffect() qDebug() << "BagOfWordsTest2::test_indexElementCopyConstructor"; // IndexElement(const IndexElement& ie); uint64_t id = 1; - BoWType type = BOW_TOKEN; + BoWType type = BoWType::BOW_TOKEN; QString word = QString::fromUtf8("word"); uint64_t cat = 0; uint64_t position = 0; @@ -175,7 +175,7 @@ void BagOfWordsTest2::test_indexElementOperatorAffect() IndexElement* el = new IndexElement(id,type,word,cat,position,length,neType); uint64_t id2 = 2; - BoWType type2 = BOW_TERM; + BoWType type2 = BoWType::BOW_TERM; QString word2 = QString::fromUtf8("other"); uint64_t cat2 = 1; uint64_t position2 = 10; @@ 
-202,7 +202,7 @@ void BagOfWordsTest2::test_indexElementOperatorAffect() delete el; el = 0; // Test members after deleting original objects QVERIFY(el2.getId() == 1); - QVERIFY(el2.getType() == BOW_TOKEN); + QVERIFY(el2.getType() == BoWType::BOW_TOKEN); QVERIFY(el2.getSimpleTerm() == "word"); QVERIFY(el2.getCategory() == 0); QVERIFY(el2.getPosition() == 0); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/GenericXmlDumper.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/GenericXmlDumper.cpp index 5dc610043..80a68b9c1 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/GenericXmlDumper.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/GenericXmlDumper.cpp @@ -742,12 +742,12 @@ xmlOutputCompound(std::ostream& out, DUMPERLOGINIT; LDEBUG << "GenericXmlDumper: output BoWToken [" << token->getOutputUTF8String() << "]"; switch (token->getType()) { - case BOW_PREDICATE:{ + case BoWType::BOW_PREDICATE:{ // FIXME To implement - LERROR << "GenericXmlDumper: BOW_PREDICATE support not implemented"; + LERROR << "GenericXmlDumper: BoWType::BOW_PREDICATE support not implemented"; break; } - case BOW_TERM: { + case BoWType::BOW_TERM: { LDEBUG << "GenericXmlDumper: output BoWTerm"; // compound informations out << "<" << m_compoundTag; @@ -793,7 +793,7 @@ xmlOutputCompound(std::ostream& out, } break; } - case BOW_NAMEDENTITY: { + case BoWType::BOW_NAMEDENTITY: { if (m_outputCompoundParts) { LinguisticGraphVertex v=boost::dynamic_pointer_cast(token)->getVertex(); LDEBUG << "GenericXmlDumper: output BoWNamedEntity of vertex " << v; @@ -809,7 +809,7 @@ xmlOutputCompound(std::ostream& out, } break; } - case BOW_TOKEN: { + case BoWType::BOW_TOKEN: { if (m_outputCompoundParts) { LinguisticGraphVertex v=boost::dynamic_pointer_cast(token)->getVertex(); LDEBUG << "GenericXmlDumper: output BoWToken of vertex " << v; @@ -819,7 +819,7 @@ 
xmlOutputCompound(std::ostream& out, } default: { DUMPERLOGINIT; - LERROR << "GenericXmlDumper: Error: BowToken has type BOW_NOTYPE"; + LERROR << "GenericXmlDumper: Error: BowToken has type BoWType::BOW_NOTYPE"; } } diff --git a/lima_linguisticprocessing/tools/common/getLexiconFromBoW.cpp b/lima_linguisticprocessing/tools/common/getLexiconFromBoW.cpp index 093fe0601..d6f6d36c8 100644 --- a/lima_linguisticprocessing/tools/common/getLexiconFromBoW.cpp +++ b/lima_linguisticprocessing/tools/common/getLexiconFromBoW.cpp @@ -165,12 +165,12 @@ LimaString getStringDecomp(boost::shared_ptr< BoWToken > token) { std::deque< BoWComplexToken::Part >::const_iterator it, it_end; boost::shared_ptr< BoWComplexToken> complexToken; switch (token->getType()) { - case BOW_TOKEN: + case BoWType::BOW_TOKEN: //cerr << "token is a simple token -> " << token->getString() << endl; return token->getString(); break; - case BOW_TERM: - case BOW_NAMEDENTITY: + case BoWType::BOW_TERM: + case BoWType::BOW_NAMEDENTITY: //cerr << "token is a complex token" << endl; complexToken=boost::dynamic_pointer_cast(token); it=complexToken->getParts().begin(); it_end=complexToken->getParts().end(); diff --git a/lima_linguisticprocessing/tools/normalize/normalizeTerm.cpp b/lima_linguisticprocessing/tools/normalize/normalizeTerm.cpp index 2e6afdc7e..a9271d3f3 100644 --- a/lima_linguisticprocessing/tools/normalize/normalizeTerm.cpp +++ b/lima_linguisticprocessing/tools/normalize/normalizeTerm.cpp @@ -305,7 +305,7 @@ multimap extractNormalization(const LimaString& source,const bowItr!=bowText.end(); bowItr++) { - if ((*bowItr)->getType() != BOW_PREDICATE) + if ((*bowItr)->getType() != BoWType::BOW_PREDICATE) { pair posLen=getStartEnd(static_cast(&**bowItr)); // cerr << " - " << (*bowItr)->getLemma() << " at " << posLen.first << "," << posLen.second; @@ -326,7 +326,7 @@ multimap extractNormalization(const LimaString& source,const pair getStartEnd(const BoWToken* tok) { pair res; - if (tok->getType()==BOW_TOKEN) + 
if (tok->getType()==BoWType::BOW_TOKEN) { res.first=tok->getPosition(); res.second=tok->getPosition()+tok->getLength(); From 383616a988a5eb92de52ddf68a3a2152b1589f84 Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Wed, 7 Sep 2016 22:15:50 +0200 Subject: [PATCH 75/82] Solves partly bug #50 --- .../fre/Numex/NUMBER-fre.rules | 32 ++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/lima_linguisticdata/SpecificEntities/fre/Numex/NUMBER-fre.rules b/lima_linguisticdata/SpecificEntities/fre/Numex/NUMBER-fre.rules index bb50c0d43..434b7bdca 100644 --- a/lima_linguisticdata/SpecificEntities/fre/Numex/NUMBER-fre.rules +++ b/lima_linguisticdata/SpecificEntities/fre/Numex/NUMBER-fre.rules @@ -198,6 +198,36 @@ LE:::NOT_NUMBER: # In sport scores like "1 - 1", each integer is a number and the - is not a minus sign @NumForm:[@NumForm] [(+|-)]::NUMBER:=>NormalizeNumber() + +@Decimal=(t_comma_number,t_dot_number) +@SmallDecimalGroup=(t_integer<100) +@IntegerGroup=(t_integer>99<1000) +@LargeNumber=(t_integer>1000) + +### Numbers in digits + +# 1 234.5 +# 12 345.6 +# 12 345 678.9 +# but also errors like: 12 345 6.7 +@SmallDecimalGroup:(+|-)?:@IntegerGroup{0-3} (@Decimal|@IntegerGroup) @ChiffreAines? \%?:NUMBER:=>NormalizeNumber() + +# 123 456.7 +@IntegerGroup:(+|-)?:@IntegerGroup{0-3} (@Decimal|@IntegerGroup) @ChiffreAines? \%?:NUMBER:=>NormalizeNumber() + +# 12 +@SmallDecimalGroup:(+|-)?:@ChiffreAines? \%?:NUMBER:=>NormalizeNumber() + +# 12345 +# 1234 millions +@LargeNumber:(+|-)?:@ChiffreAines? \%?:NUMBER:=>NormalizeNumber() + + +# 12.3 +# 123.4 +# 12345.6 +@Decimal:(+|-)?:@ChiffreAines? \%?:NUMBER:=>NormalizeNumber() + @NumForm:(+|-)?:(@NumForm|@Number)? \%?:NUMBER:=>NormalizeNumber() -@Number:(+|-)?:@Number{0-3} \%?:NUMBER:=>NormalizeNumber() +@Number:(+|-)?:@Number{0-5} @ChiffreAines? 
\%?:NUMBER:=>NormalizeNumber() @OrdNumber:(+|-)?:\%?:NUMBER:=>NormalizeNumber() From 26273ce49f301ac69118175a3298170a01ff17c0 Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Thu, 8 Sep 2016 10:20:51 +0200 Subject: [PATCH 76/82] Add tva test for issue #50 --- .../data/test-fre.se.xml | 65 ++++++++++++++----- 1 file changed, 47 insertions(+), 18 deletions(-) diff --git a/lima_linguisticprocessing/data/test-fre.se.xml b/lima_linguisticprocessing/data/test-fre.se.xml index 36d663664..1a2131c08 100644 --- a/lima_linguisticprocessing/data/test-fre.se.xml +++ b/lima_linguisticprocessing/data/test-fre.se.xml @@ -682,22 +682,22 @@ left="XPATH#//specific_entities/specific_entity[position=28][length=12]/type" operator="=" right="DateTime.DATE"/> - + + TIMEX : 25 + + + + + + + + + + @@ -1000,12 +1000,12 @@ right="Numex.NUMEX"/> comment="pourcentage" left="XPATH#//specific_entities/specific_entity[position=37][length=4]/type" operator="=" -right="Numex.NUMEX"/> +right="Numex.NUMBER"/> +right="Numex.NUMBER"/> @@ -1043,5 +1043,34 @@ operator="=" operator="=" right="Numex.NUMEX"/> + + + NUMEX: test consecutive numbers. 
See issue #50 on github + + + + + + + + + + + + + + From 848d7e62d56a47860febce54f2b6d1c528da44a4 Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Wed, 28 Sep 2016 10:48:50 +0200 Subject: [PATCH 77/82] Correct positions in xml analysis --- .../core/AnalysisDumpers/BowDumper.cpp | 27 ++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowDumper.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowDumper.cpp index ca7bfa31c..1aa6ec36e 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowDumper.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowDumper.cpp @@ -186,7 +186,32 @@ LimaStatusCode BowDumper::process( bowText.lang=metadata->getMetaData("Lang"); buildBoWText(annotationData, syntacticData, bowText,analysis,anagraph,posgraph); - BoWBinaryWriter writer(handler->shiftFrom()); + // Exclude from the shift list XML entities preceding the offset and + // readjust positions regarding the beginning of the node being analyzed + uint64_t offset = metadata->getStartOffset(); + QMap localShiftFrom; + const auto& globalShiftFrom = handler->shiftFrom(); + if (!globalShiftFrom.isEmpty()) + { + uint64_t diff = 0; + // start first loop at second position + auto it=globalShiftFrom.constBegin()+1; + for (; it!=globalShiftFrom.constEnd(); ++it) + { + if (it.key()+(it-1).value() >= offset) + break; + diff = it.value(); + } + // rewind by one to not miss the first entity and then + // continue from where we stoped the shift corrections + for (it = it -1; it!=globalShiftFrom.constEnd(); ++it) + if (it.value() > diff) + { + // empirical correction but seems to work + localShiftFrom.insert(it.key()+diff, it.value()-diff); + } + } + BoWBinaryWriter writer(localShiftFrom); DumperStream* dstream=initialize(analysis); #ifdef DEBUG_LP From 448e214f22801dcac8c6debeb380115ff5e4e3ab Mon Sep 17 
00:00:00 2001 From: Gael de Chalendar Date: Wed, 28 Sep 2016 11:30:35 +0200 Subject: [PATCH 78/82] Correct term lengths taking entities into account --- .../BagOfWords/bowBinaryReaderWriter.cpp | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowBinaryReaderWriter.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowBinaryReaderWriter.cpp index d71259873..a922319a8 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowBinaryReaderWriter.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowBinaryReaderWriter.cpp @@ -632,6 +632,60 @@ void BoWBinaryWriterPrivate::writeSimpleToken(std::ostream& file, #endif Misc::writeUTF8StringField(file,token->getInflectedForm()); Misc::writeCodedInt(file,token->getCategory()); +#define CORRECTION +#ifdef CORRECTION //<----------------------------------------------------------------------------------------------- + + auto beg = token->getPosition(); + auto end = token->getLength() + beg; + //::std::cout << "beg: " << beg << " end: " << end << ::std::endl; + + if (m_shiftFrom.empty()) + { +#ifdef DEBUG_LP + LDEBUG << "BoWBinaryWriter::writeSimpleToken shiftFrom is empty"; +#endif + } + else + { +#ifdef DEBUG_LP + LDEBUG << "BoWBinaryWriter::writeSimpleToken shiftFrom from begin" << beg; + LDEBUG << "BoWBinaryWriter::writeSimpleToken shiftFrom from end" << end; +#endif + auto const it1 = m_shiftFrom.lowerBound(beg-1); + if (it1 == m_shiftFrom.constBegin()) + { +#ifdef DEBUG_LP + LDEBUG << "BoWBinaryWriter::writeSimpleToken shiftFrom from begin: NO shift"; +#endif + } + else + { +#ifdef DEBUG_LP + LDEBUG << "BoWBinaryWriter::writeSimpleToken shiftFrom from begin: shift by" << (it1-1).value(); +#endif + beg += (it1-1).value(); + } + auto const it2 = m_shiftFrom.lowerBound(end-1); + if (it2 == m_shiftFrom.constBegin()) + { +#ifdef DEBUG_LP + LDEBUG << 
"BoWBinaryWriter::writeSimpleToken shiftFrom from end: NO shift"; +#endif + } + else + { +#ifdef DEBUG_LP + LDEBUG << "BoWBinaryWriter::writeSimpleToken shiftFrom from end: shift by" << (it2-1).value(); +#endif + end += (it2-1).value(); + } + } + + Misc::writeCodedInt(file, beg-1); + Misc::writeCodedInt(file, end-beg); + +#else +// code d'origine if (m_shiftFrom.empty()) { #ifdef DEBUG_LP @@ -661,6 +715,7 @@ void BoWBinaryWriterPrivate::writeSimpleToken(std::ostream& file, } } Misc::writeCodedInt(file,token->getLength()); +#endif } void BoWBinaryWriter::writePredicate(std::ostream& file, From 7ff5a8f30fcbd67a30e0714524dae81b10920cba Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Wed, 28 Sep 2016 11:38:52 +0200 Subject: [PATCH 79/82] Code refactoring. Remove unused old code and rename some variables. --- .../BagOfWords/bowBinaryReaderWriter.cpp | 52 +++---------------- 1 file changed, 8 insertions(+), 44 deletions(-) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowBinaryReaderWriter.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowBinaryReaderWriter.cpp index a922319a8..57acc5470 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowBinaryReaderWriter.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowBinaryReaderWriter.cpp @@ -632,12 +632,9 @@ void BoWBinaryWriterPrivate::writeSimpleToken(std::ostream& file, #endif Misc::writeUTF8StringField(file,token->getInflectedForm()); Misc::writeCodedInt(file,token->getCategory()); -#define CORRECTION -#ifdef CORRECTION //<----------------------------------------------------------------------------------------------- auto beg = token->getPosition(); auto end = token->getLength() + beg; - //::std::cout << "beg: " << beg << " end: " << end << ::std::endl; if (m_shiftFrom.empty()) { @@ -651,8 +648,8 @@ void BoWBinaryWriterPrivate::writeSimpleToken(std::ostream& file, LDEBUG << 
"BoWBinaryWriter::writeSimpleToken shiftFrom from begin" << beg; LDEBUG << "BoWBinaryWriter::writeSimpleToken shiftFrom from end" << end; #endif - auto const it1 = m_shiftFrom.lowerBound(beg-1); - if (it1 == m_shiftFrom.constBegin()) + auto const shiftForBeginIt = m_shiftFrom.lowerBound(beg-1); + if (shiftForBeginIt == m_shiftFrom.constBegin()) { #ifdef DEBUG_LP LDEBUG << "BoWBinaryWriter::writeSimpleToken shiftFrom from begin: NO shift"; @@ -661,12 +658,12 @@ void BoWBinaryWriterPrivate::writeSimpleToken(std::ostream& file, else { #ifdef DEBUG_LP - LDEBUG << "BoWBinaryWriter::writeSimpleToken shiftFrom from begin: shift by" << (it1-1).value(); + LDEBUG << "BoWBinaryWriter::writeSimpleToken shiftFrom from begin: shift by" << (shiftForBeginIt-1).value(); #endif - beg += (it1-1).value(); + beg += (shiftForBeginIt-1).value(); } - auto const it2 = m_shiftFrom.lowerBound(end-1); - if (it2 == m_shiftFrom.constBegin()) + auto const shiftForEndIt = m_shiftFrom.lowerBound(end-1); + if (shiftForEndIt == m_shiftFrom.constBegin()) { #ifdef DEBUG_LP LDEBUG << "BoWBinaryWriter::writeSimpleToken shiftFrom from end: NO shift"; @@ -675,47 +672,14 @@ void BoWBinaryWriterPrivate::writeSimpleToken(std::ostream& file, else { #ifdef DEBUG_LP - LDEBUG << "BoWBinaryWriter::writeSimpleToken shiftFrom from end: shift by" << (it2-1).value(); + LDEBUG << "BoWBinaryWriter::writeSimpleToken shiftFrom from end: shift by" << (shiftForEndIt-1).value(); #endif - end += (it2-1).value(); + end += (shiftForEndIt-1).value(); } } Misc::writeCodedInt(file, beg-1); Misc::writeCodedInt(file, end-beg); - -#else -// code d'origine - if (m_shiftFrom.empty()) - { -#ifdef DEBUG_LP - LDEBUG << "BoWBinaryWriter::writeSimpleToken shiftFrom is empty"; -#endif - Misc::writeCodedInt(file,token->getPosition()-1); - } - else - { -#ifdef DEBUG_LP - LDEBUG << "BoWBinaryWriter::writeSimpleToken shiftFrom from" << token->getPosition(); -#endif - QMap::const_iterator it = m_shiftFrom.lowerBound(token->getPosition()-1); - if 
(it == m_shiftFrom.constBegin()) - { -#ifdef DEBUG_LP - LDEBUG << "BoWBinaryWriter::writeSimpleToken shiftFrom NO shift"; -#endif - Misc::writeCodedInt(file,token->getPosition()-1); - } - else - { -#ifdef DEBUG_LP - LDEBUG << "BoWBinaryWriter::writeSimpleToken shiftFrom shift by" << (it-1).value(); -#endif - Misc::writeCodedInt(file,token->getPosition()+ (it-1).value()-1); - } - } - Misc::writeCodedInt(file,token->getLength()); -#endif } void BoWBinaryWriter::writePredicate(std::ostream& file, From 1433dbdaa4a5493f04828f4e7578c23fa631c71e Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Thu, 29 Sep 2016 09:40:57 +0200 Subject: [PATCH 80/82] Correct shiftFrom taking offset into account. --- .../core/AnalysisDumpers/BowDumper.cpp | 26 +++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowDumper.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowDumper.cpp index 1aa6ec36e..17a555c65 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowDumper.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowDumper.cpp @@ -191,6 +191,10 @@ LimaStatusCode BowDumper::process( uint64_t offset = metadata->getStartOffset(); QMap localShiftFrom; const auto& globalShiftFrom = handler->shiftFrom(); +#ifdef DEBUG_LP + LDEBUG << "BowDumper::process offset:" << offset; + LDEBUG << "BowDumper::process globalShiftFrom:" << globalShiftFrom; +#endif if (!globalShiftFrom.isEmpty()) { uint64_t diff = 0; @@ -198,24 +202,42 @@ LimaStatusCode BowDumper::process( auto it=globalShiftFrom.constBegin()+1; for (; it!=globalShiftFrom.constEnd(); ++it) { +#ifdef DEBUG_LP + LDEBUG << "BowDumper::process it.key():"<= offset) break; diff = it.value(); } +#ifdef DEBUG_LP + LDEBUG << "BowDumper::process after shiftFrom loop, diff is:" << diff; +#endif // rewind by one to not miss the first entity and then // 
continue from where we stoped the shift corrections for (it = it -1; it!=globalShiftFrom.constEnd(); ++it) - if (it.value() > diff) + { +#ifdef DEBUG_LP + LDEBUG << "BowDumper::process it.key():"<= offset && it.value() > diff) { // empirical correction but seems to work localShiftFrom.insert(it.key()+diff, it.value()-diff); } + } } +#ifdef DEBUG_LP + LDEBUG << "BowDumper::process localShiftFrom:" << localShiftFrom; +#endif BoWBinaryWriter writer(localShiftFrom); DumperStream* dstream=initialize(analysis); #ifdef DEBUG_LP - LDEBUG << "BowDumper::process writing BoW text on" << dstream->out(); + LDEBUG << "BowDumper::process writing BoW text on" << dstream->out(); #endif writer.writeBoWText(dstream->out(),bowText); delete dstream; From b1de1eef65df08fa20b89a22cc2311fef476cdaa Mon Sep 17 00:00:00 2001 From: Frederic Wils Date: Wed, 5 Oct 2016 17:36:33 +0200 Subject: [PATCH 81/82] msg --- lima_antinno/src/antinno.ResourcesIdent.h | 101 + .../AbstractFactoryPatternExport.h | 2 +- .../AmosePluginsManager.cpp | 78 +- .../AmosePluginsManager.h | 11 +- .../DynamicLibrariesManager.cpp | 79 +- .../DynamicLibrariesManager.h | 11 +- .../ProcessingClientFactory.h | 8 +- .../common/AbstractFactoryPattern/Singleton.h | 23 +- .../antinno.LibraryLoader.class.cpp | 97 + .../antinno.LibraryLoader.class.h | 15 + .../AbstractProcessingClient.h | 14 +- lima_common/src/common/Data/DataTypes.cpp | 6 +- lima_common/src/common/Data/LimaString.cpp | 16 +- lima_common/src/common/Data/LimaString.h | 9 +- .../common/Data/genericDocumentProperties.cpp | 45 +- .../src/common/Data/tests/FileUtilsTest.cpp | 2 +- .../common/FsaAccess/CompoundStringAccess.h | 4 - .../src/common/FsaAccess/FsaAccessIOHandler.h | 2 +- .../src/common/FsaAccess/FsaAccessSpare16.cpp | 4 - .../common/Handler/AbstractDocumentHandler.h | 22 +- .../Handler/AbstractProcessingClientHandler.h | 27 +- .../Handler/AbstractXmlAnalysisHandler.h | 2 +- lima_common/src/common/LimaCommon.cpp | 130 +- lima_common/src/common/LimaCommon.h | 
849 ++++--- .../MediaProcessors/MediaProcessors.cpp | 16 +- .../src/common/MediaticData/EntityType.h | 2 +- .../src/common/MediaticData/mediaData.cpp | 4 +- .../src/common/MediaticData/mediaData.h | 11 +- .../src/common/MediaticData/mediaticData.cpp | 173 +- .../src/common/MediaticData/mediaticData.h | 2 - .../MediaticData/tests/MediaticDataTest.cpp | 112 + .../ProcessUnitFramework/AnalysisContent.cpp | 16 +- .../ProcessUnitFramework/AnalysisContent.h | 13 +- lima_common/src/common/QsLog/QsLog.cpp | 72 +- lima_common/src/common/QsLog/QsLog.h | 62 + .../src/common/QsLog/QsLogCategories.cpp | 426 ++-- .../xmlConfigurationFileParser.cpp | 5 + .../common/misc/AbstractAccessIterators.cpp | 10 - lima_common/src/common/misc/stringspool.cpp | 7 +- lima_common/src/common/misc/stringspool.h | 2 +- .../src/common/time/timeUtilsController.cpp | 5 + lima_common/src/common/time/traceUtils.cpp | 117 +- lima_common/src/common/time/traceUtils.h | 21 +- lima_common/src/common/tools/FileUtils.cpp | 201 ++ lima_common/src/common/tools/FileUtils.h | 130 + lima_common/test/testFsaDict16.cpp | 1873 +++++++------- .../AbstractLinguisticProcessingClient.h | 14 +- .../AnalysisHandlers/BowDocumentHandler.cpp | 2 +- .../AnalysisHandlers/LTRTextHandler.cpp | 87 + .../client/AnalysisHandlers/LTRTextHandler.h | 68 + .../StructuredBoWToBoWDocument.cpp | 4 +- .../StructuredBoWToBoWDocument.h | 4 +- .../LinguisticProcessingClientFactory.cpp | 4 +- .../LinguisticProcessingClientFactory.h | 8 +- .../BagOfWords/AbstractBoWDocumentHandler.h | 6 +- .../common/BagOfWords/AbstractBoWElement.h | 2 +- .../BinaryWriterBoWDocumentHandler.cpp | 4 +- .../BinaryWriterBoWDocumentHandler.h | 4 +- .../TextWriterBoWDocumentHandler.cpp | 64 +- .../BagOfWords/TextWriterBoWDocumentHandler.h | 4 +- .../BagOfWords/bowBinaryReaderWriter.cpp | 93 +- .../common/BagOfWords/bowBinaryReaderWriter.h | 3 +- .../common/BagOfWords/bowComplexToken.cpp | 21 +- .../common/BagOfWords/bowDocument.cpp | 4 - 
.../common/BagOfWords/bowNamedEntity.h | 2 +- .../common/BagOfWords/bowText.cpp | 2 - .../common/BagOfWords/bowToken.cpp | 14 +- .../common/BagOfWords/bowToken.h | 8 + .../common/BagOfWords/bowXMLWriter.cpp | 103 +- .../common/BagOfWords/bowXMLWriter.h | 23 +- .../common/BagOfWords/indexElement.cpp | 22 +- .../BagOfWords/indexElementIterator.cpp | 13 +- .../PropertyCode/PropertyCodeManager.cpp | 15 +- .../linearTextRepresentation/ltrText.cpp | 3 + .../common/linearTextRepresentation/ltrText.h | 8 + .../common/linguisticData/languageData.cpp | 50 +- .../common/misc/positionLengthList.cpp | 179 +- .../common/misc/positionLengthList.h | 13 + .../common/tgv/TestCaseProcessor.cpp | 3 + .../core/AnalysisDict/DictionaryData.cpp | 319 +-- .../EnhancedAnalysisDictionary.cpp | 9 +- .../core/AnalysisDict/FsaAccessResource.cpp | 318 ++- .../core/AnalysisDict/FsaRwAccessResource.cpp | 31 +- .../MultiLevelAnalysisDictionary.cpp | 5 +- .../core/AnalysisDumpers/BowDumper.cpp | 57 +- .../core/AnalysisDumpers/BowGeneration.cpp | 82 +- .../ConstituantAndRelationExtractor.cpp | 8 +- .../core/AnalysisDumpers/GenericXmlDumper.cpp | 35 +- .../core/AnalysisDumpers/GenericXmlDumper.h | 12 +- .../core/AnalysisDumpers/LTRTextBuilder.cpp | 23 +- .../core/AnalysisDumpers/LTRTextBuilder.h | 9 + .../core/AnalysisDumpers/SimpleXmlDumper.cpp | 35 +- .../core/AnalysisDumpers/StopList.cpp | 7 +- .../AnalysisDumpers/TextFeaturesDumper.cpp | 11 +- .../core/AnalysisDumpers/TextFeaturesDumper.h | 1 + .../AnalysisDumpers/WordFeatureExtractor.cpp | 165 +- .../AnalysisDumpers/WordFeatureExtractor.h | 53 +- .../linearTextRepresentationDumper.cpp | 5 +- .../linearTextRepresentationLogger.cpp | 5 +- .../core/Automaton/EntityFeatures.cpp | 19 + .../core/Automaton/EntityFeatures.h | 35 +- .../Automaton/SpecificEntityAnnotation.cpp | 8 + .../core/Automaton/automaton.cpp | 187 +- .../core/Automaton/automaton.h | 9 +- .../core/Automaton/automatonCommon.cpp | 31 + .../core/Automaton/automatonCommon.h | 6 + 
.../core/Automaton/automatonReaderWriter.cpp | 41 + .../core/Automaton/entityGroupTransition.cpp | 130 + .../core/Automaton/entityGroupTransition.h | 86 + .../core/Automaton/gazeteerTransition.cpp | 376 +++ .../core/Automaton/gazeteerTransition.h | 112 + .../core/Automaton/recognizer.cpp | 2217 +++++++++-------- .../core/Automaton/recognizerData.h | 10 + .../core/Automaton/recognizerMatch.cpp | 24 + .../core/Automaton/rule.cpp | 2 +- .../Automaton/transitionSearchStructure.h | 12 + .../core/Automaton/transitionUnit.h | 4 +- .../core/CoreLinguisticProcessingClient.cpp | 49 +- .../core/CoreLinguisticProcessingClient.h | 18 +- .../core/CorefSolving/corefSolver.cpp | 2 +- .../CorefSolving/coreferentAnnotation.cpp | 51 +- .../core/CorefSolving/coreferentAnnotation.h | 15 +- .../core/Dictionary/DictionaryCode.cpp | 30 +- .../core/EventAnalysis/EventTemplate.cpp | 27 +- .../core/EventAnalysis/EventTemplate.h | 24 +- .../core/EventAnalysis/EventTemplateData.cpp | 8 + .../core/EventAnalysis/EventTemplateData.h | 1 + .../EventTemplateDataXmlLogger.cpp | 4 +- .../EventTemplateDefinitionResource.cpp | 80 +- .../EventTemplateDefinitionResource.h | 20 +- .../EventTemplateFillingActions.cpp | 10 +- .../EventTemplateFillingActions.h | 2 +- .../EventAnalysis/EventTemplateStructure.cpp | 20 +- .../EventAnalysis/EventTemplateStructure.h | 21 +- .../core/FlatTokenizer/Automaton.cpp | 7 +- .../core/FlatTokenizer/CharChart.cpp | 6 +- .../core/FlatTokenizer/Tokenizer.cpp | 6 +- .../core/FlatTokenizer/TokenizerAutomaton.cpp | 44 - .../LinguisticGraph.h | 2 +- .../MorphoSyntacticData.cpp | 27 - .../MorphoSyntacticData.h | 4 - .../AbstractTextualAnalysisDumper.cpp | 7 +- .../LinguisticProcessors/AnalysisLoader.cpp | 154 ++ .../LinguisticProcessors/AnalysisLoader.h | 99 + .../ExternalProcessUnit.cpp | 220 ++ .../ExternalProcessUnit.h | 100 + .../LinguisticProcessors/StatusLogger.cpp | 10 +- .../core/LinguisticProcessors/StatusLogger.h | 2 +- .../LinguisticResources.cpp | 20 +- 
.../linguisticProcessing/core/Modex/Modex.cpp | 5 +- .../AbbreviationSplitAlternatives.cpp | 50 +- .../AbbreviationSplitAlternatives.h | 1 + .../AlternativesReader.cpp | 2 +- .../ConcatenatedDataHandler.cpp | 11 +- .../MorphologicAnalysis/DefaultProperties.cpp | 39 +- .../DesagglutinationResources.cpp | 9 +- .../EnchantSpellingAlternatives.cpp | 37 +- .../HyphenWordAlternatives.cpp | 696 +++--- .../OrthographicAlternatives.cpp | 12 +- .../core/MorphologicAnalysis/SimpleWord.cpp | 549 ++-- .../PosTagger/DynamicSvmToolPosTagger.cpp | 16 +- .../core/PosTagger/SvmToolPosTagger.cpp | 11 +- .../core/PosTagger/ViterbiPosTagger.cpp | 7 +- .../core/PosTagger/ViterbiPosTagger.h | 2 +- .../core/PosTagger/ngramMatrices.cpp | 14 +- .../core/RegexMatcher/RegexMatcher.cpp | 1 - .../core/SemanticAnalysis/ConllDumper.cpp | 685 ++++- .../KnowledgeBasedSemanticRoleLabeler.cpp | 498 +++- .../SemanticRelationsXmlLogger.cpp | 314 +++ .../SemanticRelationsXmlLogger.h | 103 + .../SemanticRoleLabelingLoader.cpp | 508 +++- .../SpecificEntities/NormalizeDateTime.cpp | 25 +- .../NormalizeDateTimeResources.cpp | 112 +- .../NormalizeDateTimeResources.h | 16 +- .../SpecificEntitiesConstraints.cpp | 247 +- .../SpecificEntitiesConstraints.h | 59 +- .../SpecificEntitiesLoader.cpp | 9 +- .../SpecificEntitiesMicros.cpp | 14 +- .../SpecificEntitiesRecognizer.cpp | 1 - .../SpecificEntitiesXmlLogger.cpp | 76 +- .../SpecificEntitiesXmlLogger.h | 3 +- .../DotDependencyGraphWriter.cpp | 2 +- .../HomoSyntagmaticConstraints.cpp | 87 +- .../HomoSyntagmaticConstraints.h | 2 +- .../SelectionalPreferences.cpp | 8 +- .../SelectionalRestrictionsConstraints.cpp | 5 + .../SyntacticAnalysisTools.cpp | 4 +- .../SyntacticAnalyzer-chains.cpp | 1634 ++++++------ .../SyntacticAnalyzer-chains.h | 5 + .../SyntacticAnalyzer-deps.cpp | 25 +- .../SyntacticAnalyzer-simplify.cpp | 4 +- .../SyntacticAnalysis/SyntagmaticMatrix.cpp | 6 +- .../SegmentationResultsLoader.cpp | 4 + .../SentenceBoundariesFinder.cpp | 25 + 
.../core/WordSenseAnalysis/Test.cpp | 2 +- .../test/analyzeText.cpp | 92 +- .../test/analyzetextservercore.cpp | 6 +- .../test/limaServer/AnalysisWrapper.cpp | 14 +- .../test/limaServer/AnalysisWrapper.h | 6 +- .../test/limaServer/LimaDBusServer.cpp | 2 +- .../test/limaServer/LimaServer.cpp | 15 +- .../test/limaServer/LimaServer.h | 3 +- .../test/limaServer/analysisthread.cpp | 2 - .../test/limaServer/main.cpp | 7 +- lima_linguisticprocessing/test/srl.cpp | 5 +- .../tools/applyAutomaton/apply-rules.cpp | 7 +- .../tools/automatonCompiler/compile-rules.cpp | 1155 +++++---- .../automatonCompiler.cpp | 68 +- .../libautomatonCompiler/automatonCompiler.h | 3 + .../libautomatonCompiler/automatonString.cpp | 31 +- .../libautomatonCompiler/automatonString.h | 5 + .../libautomatonCompiler/gazeteer.cpp | 28 +- .../libautomatonCompiler/gazeteer.h | 8 +- .../recognizerCompiler.cpp | 156 +- .../libautomatonCompiler/ruleCompiler.cpp | 4 +- .../libautomatonCompiler/ruleString.cpp | 3 +- .../transitionCompiler.cpp | 135 +- .../libautomatonCompiler/transitionCompiler.h | 22 +- .../libautomatonCompiler/tstring.cpp | 3 +- .../tools/common/catBowFiles.cpp | 2 +- .../tools/common/getLexiconFromBoW.cpp | 12 +- .../tools/common/parseXMLFile.cpp | 4 +- .../tools/common/parseXMLPropertyFile.cpp | 9 +- .../tools/common/readBoWFile.cpp | 6 +- .../tools/common/readLinguisticData.cpp | 4 +- .../tools/common/testAccessMethod.cpp | 4 +- .../tools/common/testContentDict16.cpp | 44 +- .../tools/common/testReadLexicon.cpp | 6 +- .../tools/dictionary/compileDictionary.cpp | 678 ++--- .../tools/dictionary/testComposedDict.cpp | 28 +- .../tools/normalize/desaccent.cpp | 4 +- .../tools/normalize/normalizeTerm.cpp | 68 +- .../tools/tva/AnalysisTestCase.cpp | 2 +- lima_linguisticprocessing/tools/tva/tva.cpp | 74 +- lima_linguisticprocessing/tools/tvr/tvr.cpp | 6 +- 235 files changed, 13273 insertions(+), 6683 deletions(-) create mode 100644 lima_antinno/src/antinno.ResourcesIdent.h create mode 100644 
lima_common/src/common/AbstractFactoryPattern/antinno.LibraryLoader.class.cpp create mode 100644 lima_common/src/common/AbstractFactoryPattern/antinno.LibraryLoader.class.h create mode 100644 lima_common/src/common/MediaticData/tests/MediaticDataTest.cpp create mode 100644 lima_common/src/common/tools/FileUtils.cpp create mode 100644 lima_common/src/common/tools/FileUtils.h create mode 100644 lima_linguisticprocessing/src/linguisticProcessing/client/AnalysisHandlers/LTRTextHandler.cpp create mode 100644 lima_linguisticprocessing/src/linguisticProcessing/client/AnalysisHandlers/LTRTextHandler.h create mode 100644 lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/entityGroupTransition.cpp create mode 100644 lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/entityGroupTransition.h create mode 100644 lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/gazeteerTransition.cpp create mode 100644 lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/gazeteerTransition.h diff --git a/lima_antinno/src/antinno.ResourcesIdent.h b/lima_antinno/src/antinno.ResourcesIdent.h new file mode 100644 index 000000000..39f9d0c93 --- /dev/null +++ b/lima_antinno/src/antinno.ResourcesIdent.h @@ -0,0 +1,101 @@ + +#ifndef ghdghscjenicfhermfuchhmfmaixfxdsqksdogqjefqojxefoejkg +#define ghdghscjenicfhermfuchhmfmaixfxdsqksdogqjefqojxefoejkg + +/* +Code copi de AntResourcesIdent.h + +Sans doute amliorer + +FW 30/10/2013 + + +*/ +// +// C++ Implementation : AntResourcesIdent +// +// Description: analyse les identifiants d'une ressource binaire Ant'inno +// +// Author: Jean-Yves Sage , (C) 2010-2011 +// +// Copyright: See COPYING file that comes with this distribution +// +//////////////////////////////////////////////////////////// +#include +#include +#include + +namespace antinno { + +//brief This class extracts identifiers from binary resources files + +class ResourcesIdent +{ +public: + 
//------------------------------------------------------------------------------------------------------------------------------ + //param header header in memory + //param headerSize header size (for check) + ResourcesIdent(const char *header, const ::std::size_t headerSize) + : _pHeader(header), _pHeaderSize(headerSize) + { + } + //------------------------------------------------------------------------------------------------------------------------------ + //return string ready to display */ + ::std::string toHumanReadableString() + { + unsigned char *currentPtr = (unsigned char*)_pHeader; + //UNSIGNED indispensable pour calculer les valeurs des entiers + ::std::ostringstream resultoss; + //lit les noms d'identifiants + const ::std::size_t namesSize = _readInt4LE(currentPtr); + const ::std::string names = ::std::string((char*)currentPtr, namesSize); + currentPtr += namesSize; + //lit les valeurs + + const ::std::size_t valuesNb = _readInt4LE(currentPtr) / 4; + ::std::size_t ptrb = 0; + for (::std::size_t i=0; i +#include #ifdef WIN32 diff --git a/lima_common/src/common/AbstractFactoryPattern/AmosePluginsManager.cpp b/lima_common/src/common/AbstractFactoryPattern/AmosePluginsManager.cpp index 93b773ab7..64ce209d1 100644 --- a/lima_common/src/common/AbstractFactoryPattern/AmosePluginsManager.cpp +++ b/lima_common/src/common/AbstractFactoryPattern/AmosePluginsManager.cpp @@ -19,6 +19,7 @@ #include "AmosePluginsManager.h" #include "common/LimaCommon.h" #include "common/AbstractFactoryPattern/DynamicLibrariesManager.h" +#include "common/tools/FileUtils.h" #include #include @@ -26,49 +27,68 @@ using namespace Lima; using namespace Lima::Common; +using namespace Lima::Common::Misc; AmosePluginsManager::AmosePluginsManager() { loadPlugins(); } -bool AmosePluginsManager::loadPlugins() +bool AmosePluginsManager::loadPlugins(const QString& configDirs) { - ABSTRACTFACTORYPATTERNLOGINIT; - LINFO << "AmosePluginsManager::loadPlugins"; +// ABSTRACTFACTORYPATTERNLOGINIT; +// 
LINFO << "AmosePluginsManager::loadPlugins"; // DynamicLibrariesManager::changeable().addSearchPath("c:\amose\lib");; // open LIMA_CONF/plugins file - QDir pluginsDir(QString::fromUtf8(qgetenv("LIMA_CONF").constData()==0?"":qgetenv("LIMA_CONF").constData()) + "/plugins"); - QStringList pluginsFiles = pluginsDir.entryList(QDir::Files); - Q_FOREACH(QString pluginsFile, pluginsFiles) + + QStringList configDirsList = configDirs.split(LIMA_PATH_SEPARATOR); + if (configDirsList.isEmpty()) { -#ifdef DEBUG_CD - LDEBUG << "AmosePluginsManager::loadPlugins loding plugins file " << pluginsFile.toUtf8().data(); + // Look for LIMA_CONF directory. + configDirsList = buildConfigurationDirectoriesList(QStringList() << "lima", QStringList()); + } +#ifdef ANTINNO_SPECIFIC + Q_FOREACH(const QString& configDir, configDirsList) +#else + for(const QString& configDir : configDirsList) #endif - QFile file(pluginsDir.path() + "/" + pluginsFile); - if (!file.open(QIODevice::ReadOnly)) - return false; - // for each entry, call load library - while (!file.atEnd()) + { + // Deduce plugins directory. + QString stdPluginsDir(configDir); + stdPluginsDir.append("/plugins"); + QDir pluginsDir(stdPluginsDir); + + // For each file under plugins directory, read plugins names and deduce shared libraries to load. + QStringList pluginsFiles = pluginsDir.entryList(QDir::Files); + Q_FOREACH(QString pluginsFile, pluginsFiles) { - QByteArray line = file.readLine(); - if (line.endsWith('\n')) line.chop(1); - // Allows empty and comment lines - if ( !line.isEmpty() && !line.startsWith('#') ) +// #ifdef DEBUG_CD +// LDEBUG << "AmosePluginsManager::loadPlugins loading plugins file " << pluginsFile.toUtf8().data(); +// #endif + // Open plugin file. 
+ QFile file(pluginsDir.path() + "/" + pluginsFile); + if (!file.open(QIODevice::ReadOnly)) { + ABSTRACTFACTORYPATTERNLOGINIT; + LERROR << "AmosePluginsManager::loadPlugins: cannot open plugins file " << pluginsFile.toUtf8().data(); + return false; + } + + // For each entry, call load library + while (!file.atEnd()) { -#ifdef WIN32 - QString strline = QString(line.data()).trimmed() + ".dll"; - QString library_path=QString::fromUtf8(qgetenv("LD_LIBRARY_PATH").constData()==0?"c:\amose\lib":qgetenv("LD_LIBRARY_PATH").constData()); - DynamicLibrariesManager::changeable().addSearchPathes( library_path.toUtf8().data()); -#else - QString strline = QString("lib") + line.data() + ".so"; -#endif -#ifdef DEBUG_CD - LDEBUG << "AmosePluginsManager::loadPlugins loading plugin '" << line.data() << "'"; -#endif - DynamicLibrariesManager::changeable().loadLibrary(line.data()); + // Remove whitespace characters from the start and the end. + QString line = QString(file.readLine()).trimmed(); + + // Allow empty and comment lines. 
+ if ( !line.isEmpty() && !line.startsWith('#') ) + { +// #ifdef DEBUG_CD +// LDEBUG << "AmosePluginsManager::loadPlugins loading plugin '" << line.toStdString().c_str() << "'"; +// #endif + DynamicLibrariesManager::changeable().loadLibrary(line.toStdString().c_str()); + } } } } return true; -} +} \ No newline at end of file diff --git a/lima_common/src/common/AbstractFactoryPattern/AmosePluginsManager.h b/lima_common/src/common/AbstractFactoryPattern/AmosePluginsManager.h index fa43178d2..ca2bf6725 100644 --- a/lima_common/src/common/AbstractFactoryPattern/AmosePluginsManager.h +++ b/lima_common/src/common/AbstractFactoryPattern/AmosePluginsManager.h @@ -22,6 +22,8 @@ #include "common/AbstractFactoryPattern/AbstractFactoryPatternExport.h" #include "common/AbstractFactoryPattern/Singleton.h" +#include + namespace Lima { @@ -29,12 +31,17 @@ class LIMA_FACTORY_EXPORT AmosePluginsManager : public Singleton { friend class Singleton; +public: + virtual ~AmosePluginsManager() {} + + /** Load plugins in the plugins subdir of the semicolon separated config dirs + * @param configDirs semicolon separated list of config dirs. 
If empty, loads a default location + */ + bool loadPlugins(const QString& configDirs = ""); private: AmosePluginsManager(); - virtual ~AmosePluginsManager() {} - bool loadPlugins(); }; } diff --git a/lima_common/src/common/AbstractFactoryPattern/DynamicLibrariesManager.cpp b/lima_common/src/common/AbstractFactoryPattern/DynamicLibrariesManager.cpp index 95bfbdbcd..aa732038b 100644 --- a/lima_common/src/common/AbstractFactoryPattern/DynamicLibrariesManager.cpp +++ b/lima_common/src/common/AbstractFactoryPattern/DynamicLibrariesManager.cpp @@ -31,63 +31,78 @@ #include #include #include -#include +#ifdef ANTINNO_SPECIFIC +// FWI 17/08/2015 : dsactiv car n'existe pas dans QT4 +#else +#include +#endif using namespace std; namespace Lima { namespace Common { -DynamicLibrariesManager::DynamicLibrariesManager() +class DynamicLibrariesManagerPrivate +{ +friend class DynamicLibrariesManager; + DynamicLibrariesManagerPrivate(); + + std::map > m_handles; + // at load time, will try to load the libraries from these paths before the default ones + std::vector m_supplementarySearchPath; +}; + +DynamicLibrariesManagerPrivate::DynamicLibrariesManagerPrivate() : + m_handles(), + m_supplementarySearchPath() +{ +} + + +DynamicLibrariesManager::DynamicLibrariesManager() : m_d(new DynamicLibrariesManagerPrivate()) { } DynamicLibrariesManager::~DynamicLibrariesManager() { - for (std::map::iterator - it=m_handles.begin(),it_end=m_handles.end(); it!=it_end; it++) - { - delete (*it).second; - } } bool DynamicLibrariesManager:: isLoaded(const std::string& libName) { - std::map::const_iterator - it=m_handles.find(libName); - return (it!=m_handles.end()); + auto it=m_d->m_handles.find(libName); + return (it!=m_d->m_handles.end()); } -bool DynamicLibrariesManager:: -loadLibrary(const std::string& libName) +bool DynamicLibrariesManager::loadLibrary(const std::string& libName) { #ifdef DEBUG_CD ABSTRACTFACTORYPATTERNLOGINIT; LDEBUG <<"DynamicLibrariesManager::loadLibrary() -- 
"<<"libName="<::const_iterator - it=m_handles.find(libName); - if (it!=m_handles.end()) { + auto it=m_d->m_handles.find(libName); + if (it!=m_d->m_handles.end()) { #ifdef DEBUG_CD - LWARN << "DEBUG_CD: trying to reload dynamic library " << libName.c_str(); + LDEBUG << "DynamicLibrariesManager::loadLibrary trying to reload dynamic library" << libName.c_str(); + return false; #endif } - QLibrary* libhandle = 0; + std::shared_ptr< QLibrary > libhandle; // try supplementary search path - for (std::vector::const_iterator it = m_supplementarySearchPath.begin(); it != m_supplementarySearchPath.end(); it++) + for (auto it = m_d->m_supplementarySearchPath.begin(); it != m_d->m_supplementarySearchPath.end(); it++) { #ifdef DEBUG_FACTORIES LDEBUG << "Trying supplementary " << ((*it)+"/"+libName).c_str(); #endif - libhandle = new QLibrary( ((*it)+"/"+libName).c_str() ); + libhandle = std::shared_ptr< QLibrary >(new QLibrary( ((*it)+"/"+libName).c_str() )); libhandle->setLoadHints(QLibrary::ResolveAllSymbolsHint | QLibrary::ExportExternalSymbolsHint); if (libhandle->load()) { - m_handles.insert(std::make_pair(libName,libhandle)); + m_d->m_handles.insert(std::make_pair(libName,libhandle)); #ifdef DEBUG_CD - LDEBUG << "the library " << libName.c_str() << " was loaded"; + LDEBUG << "the library " << libName.c_str() << " was loaded from supplementary search path"; + LDEBUG << "the library fully-qualified name: " << libhandle->fileName(); #endif return true; } @@ -96,8 +111,6 @@ loadLibrary(const std::string& libName) // if ( QLibrary::isLibrary(((*it)+"/"+libName).c_str()) ) ABSTRACTFACTORYPATTERNLOGINIT; LERROR <<"DynamicLibrariesManager::loadLibrary() -- "<<"Failed to open lib " << libhandle->errorString().toUtf8().data(); - delete libhandle; - libhandle = 0; } } // now try system default search path @@ -106,13 +119,14 @@ loadLibrary(const std::string& libName) #ifdef DEBUG_FACTORIES LINFO << "Trying " << libName.c_str(); #endif - libhandle = new QLibrary( libName.c_str() ); + 
libhandle = std::shared_ptr( new QLibrary( libName.c_str() ) ); libhandle->setLoadHints(QLibrary::ResolveAllSymbolsHint | QLibrary::ExportExternalSymbolsHint); if (libhandle->load()) { - m_handles.insert(std::make_pair(libName,libhandle)); + m_d->m_handles.insert(std::make_pair(libName,libhandle)); #ifdef DEBUG_CD - LDEBUG << "the library " << libName.c_str() << " was loaded"; + LDEBUG << "the library " << libName.c_str() << " was loaded from system default search path"; + LDEBUG << "the library fully-qualified name: " << libhandle->fileName(); #endif return true; } @@ -120,13 +134,11 @@ loadLibrary(const std::string& libName) { ABSTRACTFACTORYPATTERNLOGINIT; LINFO <<"DynamicLibrariesManager::loadLibrary() -- "<< "Failed to open lib " << libhandle->errorString().toUtf8().data(); - delete libhandle; - libhandle = 0; return false; } } else { - m_handles[libName]=libhandle; + m_d->m_handles[libName]=libhandle; #ifdef DEBUG_CD LDEBUG << "the library " << libName.c_str() << " was loaded"; #endif @@ -137,7 +149,7 @@ loadLibrary(const std::string& libName) void DynamicLibrariesManager:: addSearchPath(const std::string& searchPath) { - if(std::find(m_supplementarySearchPath.begin(), m_supplementarySearchPath.end(), searchPath)!=m_supplementarySearchPath.end()){ + if(std::find(m_d->m_supplementarySearchPath.begin(), m_d->m_supplementarySearchPath.end(), searchPath)!=m_d->m_supplementarySearchPath.end()){ return; } #ifdef DEBUG_CD @@ -145,7 +157,7 @@ addSearchPath(const std::string& searchPath) LINFO << "adding search path '"<m_supplementarySearchPath.push_back(searchPath); } @@ -155,7 +167,12 @@ addSearchPathes(QString searchPathes) #ifdef DEBUG_CD ABSTRACTFACTORYPATTERNLOGINIT; #endif +#ifdef ANTINNO_SPECIFIC + // FWI 17/08/2015 : ligne modifie car QRegularExpression n'existe pas dans QT4 + QStringList list = searchPathes.replace("\\","/").split(";", QString::SkipEmptyParts); +#else QStringList list = searchPathes.replace("\\","/").split(QRegularExpression("[;]"), 
QString::SkipEmptyParts); +#endif for(QStringList::iterator it = list.begin(); it!=list.end();++it) { QString searchPath = *it; diff --git a/lima_common/src/common/AbstractFactoryPattern/DynamicLibrariesManager.h b/lima_common/src/common/AbstractFactoryPattern/DynamicLibrariesManager.h index a3be72de3..fcb9af768 100644 --- a/lima_common/src/common/AbstractFactoryPattern/DynamicLibrariesManager.h +++ b/lima_common/src/common/AbstractFactoryPattern/DynamicLibrariesManager.h @@ -41,16 +41,19 @@ #include #include #include +#include class QString; namespace Lima { namespace Common { +class DynamicLibrariesManagerPrivate; class LIMA_FACTORY_EXPORT DynamicLibrariesManager: public Singleton { friend class Singleton; - public: + +public: ~DynamicLibrariesManager(); bool isLoaded(const std::string& libName); @@ -60,10 +63,8 @@ friend class Singleton; private: DynamicLibrariesManager(); - - std::map m_handles; - // at load time, will try to load the libraries from these paths before the default ones - std::vector m_supplementarySearchPath; + + std::unique_ptr m_d; }; } // end namespace diff --git a/lima_common/src/common/AbstractFactoryPattern/ProcessingClientFactory.h b/lima_common/src/common/AbstractFactoryPattern/ProcessingClientFactory.h index dfde4b494..8e2fe3957 100644 --- a/lima_common/src/common/AbstractFactoryPattern/ProcessingClientFactory.h +++ b/lima_common/src/common/AbstractFactoryPattern/ProcessingClientFactory.h @@ -66,7 +66,7 @@ class ProcessingClientFactory * ClientFactory must have been configured before this method is called * Use configureClientFactory() method to configure. 
*/ - virtual AbstractProcessingClient* createClient(const std::string& id) const = 0; + virtual std::shared_ptr< AbstractProcessingClient > createClient(const std::string& id) const = 0; /** * @brief show registered clientId @@ -89,7 +89,7 @@ class ProcessingClientFactoryFactory: public Singleton createProcessingClientFactory(const std::string& id) const ; private: ProcessingClientFactoryFactory() {}; }; @@ -97,8 +97,8 @@ class ProcessingClientFactoryFactory: public Singleton { public: - virtual ~AbstractProcessingClientFactoryFactory() {std::cerr << "~AbstractExtractorFactory()" << std::endl;}; - virtual ProcessingClientFactory* createProcessingClientFactory() const = 0; + virtual ~AbstractProcessingClientFactoryFactory() {}; + virtual std::shared_ptr< ProcessingClientFactory > createProcessingClientFactory() const = 0; protected: AbstractProcessingClientFactoryFactory(const std::string& id): RegistrableFactory(id) {}; diff --git a/lima_common/src/common/AbstractFactoryPattern/Singleton.h b/lima_common/src/common/AbstractFactoryPattern/Singleton.h index 009e2c577..77358757a 100644 --- a/lima_common/src/common/AbstractFactoryPattern/Singleton.h +++ b/lima_common/src/common/AbstractFactoryPattern/Singleton.h @@ -1,5 +1,5 @@ /* - Copyright 2002-2013 CEA LIST + Copyright 2002-2016 CEA LIST This file is part of LIMA. @@ -16,13 +16,10 @@ You should have received a copy of the GNU Affero General Public License along with LIMA. 
If not, see */ -/*************************************************************************** - * Copyright (C) 2004-2012 by CEA LIST * - * * - ***************************************************************************/ #ifndef LIMA_MISC_SINGLETON_H #define LIMA_MISC_SINGLETON_H +#include namespace Lima { @@ -55,20 +52,20 @@ class Singleton private: - static Object* s_instance; + static std::unique_ptr< Object > s_instance; Singleton(const Singleton&) {} }; template -Object* Singleton::s_instance(0); +std::unique_ptr< Object > Singleton::s_instance(new Object()); template const Object& Singleton::single() { if (s_instance==0) { - s_instance=new Object(); + s_instance=std::unique_ptr< Object >(new Object()); } return *s_instance; } @@ -78,9 +75,9 @@ const Object* Singleton::psingle() { if (s_instance==0) { - s_instance=new Object(); + s_instance=std::unique_ptr< Object >(new Object()); } - return s_instance; + return s_instance.get(); } template @@ -88,7 +85,7 @@ Object& Singleton::changeable() { if (s_instance==0) { - s_instance=new Object(); + s_instance=std::unique_ptr< Object >(new Object()); } return *s_instance; } @@ -98,9 +95,9 @@ Object* Singleton::pchangeable() { if (s_instance==0) { - s_instance=new Object(); + s_instance=std::unique_ptr< Object >(new Object()); } - return s_instance; + return s_instance.get(); } } // Lima diff --git a/lima_common/src/common/AbstractFactoryPattern/antinno.LibraryLoader.class.cpp b/lima_common/src/common/AbstractFactoryPattern/antinno.LibraryLoader.class.cpp new file mode 100644 index 000000000..a4d24875c --- /dev/null +++ b/lima_common/src/common/AbstractFactoryPattern/antinno.LibraryLoader.class.cpp @@ -0,0 +1,97 @@ + +#include "antinno.LibraryLoader.class.h" +#include "common/LimaCommon.h" +#include +#include +#include +#ifdef WIN32 + #define WIN32_LEAN_AND_MEAN + #include + #ifdef ERROR + //#undef ERROR + #endif +#endif + +namespace Lima { namespace antinno { + +#ifdef WIN32 +class SystemMsg +{ +public: + 
SystemMsg(DWORD msgId) : _msgId(msgId), _lpMsgBuf(NULL) + { + DWORD msgBufLen = ::FormatMessage( + FORMAT_MESSAGE_ALLOCATE_BUFFER | // max 64K bytes + FORMAT_MESSAGE_FROM_SYSTEM | + FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, // lpSource (optional) + _msgId, + MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), + (LPTSTR) &_lpMsgBuf, + 0, NULL ); + if (msgBufLen != 0) + _s.assign(static_cast<::std::wstring::value_type const*>(_lpMsgBuf)); + else // traitement par dfaut mais perfectible + _s.clear(); + } + ::std::string toUtf8String() const + { + return ::boost::locale::conv::utf_to_utf(_s); + } + ~SystemMsg() + { + LocalFree(_lpMsgBuf); + } +private: + LPVOID _lpMsgBuf; + DWORD _msgId; + ::std::wstring _s; +}; +#else + #error no implementation for non-win32 systems +#endif + +LibraryLoader::LibraryLoader() +{ +} +void LibraryLoader::loadFromFile(::std::string const& filePath) +{ + ABSTRACTFACTORYPATTERNLOGINIT + ::std::ifstream in(filePath); + if (!in) + throw ::std::exception((::std::string("Cannot open file (read mode): ") + filePath).data()); + ::std::string line; + while (::std::getline(in, line)) + { + ::boost::algorithm::trim(line); + if (line.size() > 1 && line[0] != '#') // skip comment lines beginning with "#" + { +#ifdef WIN32 + ::std::string const path = line + ".dll"; +#else +#error no implementation for non-win32 systems +#endif +#ifdef WIN32 + if (NULL != /*win32*/::LoadLibrary(path.c_str())) +#else +#error no implementation for non-win32 systems +#endif + { + ::std::cout << L"Plugin successfully loaded: " << path << ::std::endl; + LDEBUG << "Plugin successfully loaded: " << path; + } + else + { +#ifdef WIN32 + auto const msgId = ::GetLastError(); +#else +#error no implementation for non-win32 systems +#endif + ::std::cout << L"Plugin loading failed: " << line << " : (err windows " << msgId << ") " << SystemMsg(msgId).toUtf8String() << ::std::endl; + LDEBUG << L"Plugin loading failed: " << line << " : (err windows " << msgId << ") " << 
SystemMsg(msgId).toUtf8String(); + } + } + } +} + +}} diff --git a/lima_common/src/common/AbstractFactoryPattern/antinno.LibraryLoader.class.h b/lima_common/src/common/AbstractFactoryPattern/antinno.LibraryLoader.class.h new file mode 100644 index 000000000..e9e4f76f9 --- /dev/null +++ b/lima_common/src/common/AbstractFactoryPattern/antinno.LibraryLoader.class.h @@ -0,0 +1,15 @@ + +#pragma once + +#include "common/AbstractFactoryPattern/AbstractFactoryPatternExport.h" + +namespace Lima { namespace antinno { + +class LIMA_FACTORY_EXPORT LibraryLoader +{ +public: + LibraryLoader::LibraryLoader(); + void loadFromFile(::std::string const& filePath); +}; + +}} \ No newline at end of file diff --git a/lima_common/src/common/AbstractProcessingClient/AbstractProcessingClient.h b/lima_common/src/common/AbstractProcessingClient/AbstractProcessingClient.h index bee4cfccb..36cfd839b 100644 --- a/lima_common/src/common/AbstractProcessingClient/AbstractProcessingClient.h +++ b/lima_common/src/common/AbstractProcessingClient/AbstractProcessingClient.h @@ -32,7 +32,12 @@ namespace Lima class AbstractProcessingClient { public: - +#ifdef ANTINNO_SPECIFIC + // FWI 13/03/2015 : jout 3 mthodes sinon erreur de link + AbstractProcessingClient() {} + AbstractProcessingClient(AbstractProcessingClient const&) {} + AbstractProcessingClient& operator=(AbstractProcessingClient const&) { return *this; } +#endif //! 
@brief Define the destructor virtual to ensure concrete client destructors to be called virtual ~AbstractProcessingClient() {} @@ -47,7 +52,12 @@ class AbstractProcessingClient const std::map& metaData, const std::string& pipeline, const std::map& handlers, +#ifdef ANTINNO_SPECIFIC + const std::set& inactiveUnits = std::set(), + Lima::StopAnalyze const& stopAnalyze = Lima::defaultStopAnalyze) const = 0; +#else const std::set& inactiveUnits = std::set()) const = 0; +#endif }; @@ -91,7 +101,7 @@ class AbstractProcessingClientFactory /** * This function create a LinguisticProcessing client */ - virtual AbstractProcessingClient* createClient() const = 0; + virtual std::shared_ptr< AbstractProcessingClient > createClient() const = 0; /** * virtual destructor of the LinguisticProcessing client factory diff --git a/lima_common/src/common/Data/DataTypes.cpp b/lima_common/src/common/Data/DataTypes.cpp index afb7a1bea..417055c72 100644 --- a/lima_common/src/common/Data/DataTypes.cpp +++ b/lima_common/src/common/Data/DataTypes.cpp @@ -409,7 +409,7 @@ std::ostream& operator<<(ostream& os, const Node& node) QDebug& operator<<(QDebug& os, const Node& node) { - os<<"Node "<* nodes=structure.getNodes(); - os << "Structure: " << structure.getStructId() << " ; nodes ("<size()<<"): "; + os << "Structure( structId:" << structure.getStructId() << ", nodes ("<size()<<"): "; for (map::const_iterator ItrNodes = nodes->begin(); ItrNodes != nodes->end() ; ItrNodes++) { - os<<"node ("<first<<"):" << ItrNodes->second; + os<<"node ("<first<<":" << ItrNodes->second<<")"; } return os; } diff --git a/lima_common/src/common/Data/LimaString.cpp b/lima_common/src/common/Data/LimaString.cpp index a786126ab..89b2d1720 100644 --- a/lima_common/src/common/Data/LimaString.cpp +++ b/lima_common/src/common/Data/LimaString.cpp @@ -19,23 +19,27 @@ /** * @file LimaString.cpp * @date Created on : Thu Oct 9, 2003 - * @author Gael de Chalendar - + * @author Gael de Chalendar \n * Copyright (c) 2003-2012 by CEA 
LIST * @version $Id$ */ #include "LimaString.h" - namespace Lima { - +#ifdef ANTINNO_SPECIFIC +// FWI 19/05/2016 : supprimé car défini dans LimaCommon.h +// std::ostream& operator<<(std::ostream &os, const LimaString& s) +// { +// os << s.toUtf8().data(); +// return os; +// } +#else std::ostream& operator<<(std::ostream &os, const LimaString& s) { os << s.toUtf8().data(); return os; } - - +#endif } // closing namespace Lima diff --git a/lima_common/src/common/Data/LimaString.h b/lima_common/src/common/Data/LimaString.h index 7280d6f36..d56cec97b 100644 --- a/lima_common/src/common/Data/LimaString.h +++ b/lima_common/src/common/Data/LimaString.h @@ -1,4 +1,4 @@ -/* +/* Copyright 2002-2013 CEA LIST This file is part of LIMA. @@ -37,7 +37,12 @@ namespace Lima typedef QChar LimaChar; typedef QString LimaString; -LIMA_DATA_EXPORT std::ostream& operator<<(std::ostream &os, const LimaString& s); +#ifdef ANTINNO_SPECIFIC + // FWI 19/05/2016 : supprimé car défini dans LimaCommon.h + //LIMA_DATA_EXPORT std::ostream& operator<<(std::ostream &os, const LimaString& s); +#else + LIMA_DATA_EXPORT std::ostream& operator<<(std::ostream &os, const LimaString& s); +#endif } // closing namespace Lima diff --git a/lima_common/src/common/Data/genericDocumentProperties.cpp b/lima_common/src/common/Data/genericDocumentProperties.cpp index 069b695c2..d1e4a75bf 100644 --- a/lima_common/src/common/Data/genericDocumentProperties.cpp +++ b/lima_common/src/common/Data/genericDocumentProperties.cpp @@ -334,47 +334,62 @@ void GenericDocumentProperties::read(std::istream& file) { m_d->m_multipleStringValues.clear(); m_d->m_multipleWeightedPropValues.clear(); -// BOWLOGINIT; - +#ifdef DEBUG_CD + BOWLOGINIT; +#endif // read integer properties file.read((char*) &size, sizeof(uint32_t)); -// LDEBUG << "read size " << size; +#ifdef DEBUG_CD + LDEBUG << "read size " << size; +#endif for (uint32_t i(0); im_intValues.insert(std::pair(name,val)); } // read string properties file.read((char*) &size, 
sizeof(uint32_t)); -// LDEBUG << "read size " << size; +#ifdef DEBUG_CD + LDEBUG << "read size " << size; +#endif for (uint32_t i(0); im_stringValues.insert(std::pair(name,str) ); } // read date properties file.read((char*) &size, sizeof(uint32_t)); -// LDEBUG << "read size " << size; +#ifdef DEBUG_CD + LDEBUG << "read size " << size; +#endif for (uint32_t i(0); ireadDate(file); string strDate=d.toString().toUtf8().data(); -// LDEBUG << "read date " << strDate.c_str() << " as value of " << name.c_str(); +#ifdef DEBUG_CD + LDEBUG << "read date " << strDate.c_str() << " as value of " << name.c_str(); +#endif m_d->m_dateValues.insert(std::pair(name,d)); } // read date interval properties file.read((char*) &size, sizeof(uint32_t)); -// LDEBUG << "read size " << size; +#ifdef DEBUG_CD + LDEBUG << "read size " << size; +#endif for (uint32_t i(0); ireadDate(file); string strStartDate=startD.toString().toUtf8().data(); string strEndDate=endD.toString().toUtf8().data(); -// LDEBUG << "read interval [" << strStartDate.c_str() << "," << strEndDate.c_str() << " as value of " << name.c_str(); +#ifdef DEBUG_CD + LDEBUG << "read interval [" << strStartDate.c_str() << "," << strEndDate.c_str() << " as value of " << name.c_str(); +#endif std::pair interval(startD,endD); m_d->m_dateIntervalValues.insert(std::pair >(name,interval)); } // read multi-valued string properties file.read((char*) &size, sizeof(uint32_t)); -// LDEBUG << "read size " << size; +#ifdef DEBUG_CD + LDEBUG << "read size " << size; +#endif for (uint32_t i(0); im_multipleStringValues.insert(std::pair >(name,val) ); diff --git a/lima_common/src/common/Data/tests/FileUtilsTest.cpp b/lima_common/src/common/Data/tests/FileUtilsTest.cpp index a8c72bd0c..ec599c1a0 100644 --- a/lima_common/src/common/Data/tests/FileUtilsTest.cpp +++ b/lima_common/src/common/Data/tests/FileUtilsTest.cpp @@ -18,7 +18,7 @@ */ #include "FileUtilsTest.h" -#include "common/Data/FileUtils.h" +#include "common/tools/FileUtils.h" #include 
"common/QsLog/QsLogCategories.h" #include diff --git a/lima_common/src/common/FsaAccess/CompoundStringAccess.h b/lima_common/src/common/FsaAccess/CompoundStringAccess.h index 27c83a41c..5fd26b73c 100644 --- a/lima_common/src/common/FsaAccess/CompoundStringAccess.h +++ b/lima_common/src/common/FsaAccess/CompoundStringAccess.h @@ -114,10 +114,6 @@ CompoundStringAccess::CompoundStringAccess( bool trie_dire template CompoundStringAccess::~CompoundStringAccess() { -#ifdef DEBUG_CD - COMPSTRACCESSLOGINIT; - LDEBUG << "CompoundStringAccess::~CompoundStringAccess()"; -#endif } template diff --git a/lima_common/src/common/FsaAccess/FsaAccessIOHandler.h b/lima_common/src/common/FsaAccess/FsaAccessIOHandler.h index 676aa996f..1c52960be 100644 --- a/lima_common/src/common/FsaAccess/FsaAccessIOHandler.h +++ b/lima_common/src/common/FsaAccess/FsaAccessIOHandler.h @@ -26,7 +26,7 @@ #ifndef FSA_IO_HANDLER_HPP #define FSA_IO_HANDLER_HPP -#include +#include #include namespace Lima { diff --git a/lima_common/src/common/FsaAccess/FsaAccessSpare16.cpp b/lima_common/src/common/FsaAccess/FsaAccessSpare16.cpp index 09638eefa..802331c3d 100644 --- a/lima_common/src/common/FsaAccess/FsaAccessSpare16.cpp +++ b/lima_common/src/common/FsaAccess/FsaAccessSpare16.cpp @@ -56,10 +56,6 @@ FsaAccessSpare16::FsaAccessSpare16(bool trie_direction_fwd) FsaAccessSpare16::~FsaAccessSpare16() { -#ifdef DEBUG_CD - FSAALOGINIT; - LDEBUG << "FsaAccessSpare16::~FsaAccessSpare16()"; -#endif } FsaAccessIOHandler* diff --git a/lima_common/src/common/Handler/AbstractDocumentHandler.h b/lima_common/src/common/Handler/AbstractDocumentHandler.h index 6dca001e7..39a913dad 100644 --- a/lima_common/src/common/Handler/AbstractDocumentHandler.h +++ b/lima_common/src/common/Handler/AbstractDocumentHandler.h @@ -45,17 +45,17 @@ class AbstractDocumentHandler //! 
@brief destructor virtual ~AbstractDocumentHandler(){}; - virtual void writeDocumentsHeader(){}; - virtual void writeDocumentsFooter(){}; - - virtual void openSNode(const Lima::Common::Misc::GenericDocumentProperties* properties, - const std::string& elementName) = 0; - virtual void openSIndexingNode(const Lima::Common::Misc::GenericDocumentProperties* properties, - const std::string& elementName) = 0; - virtual void processProperties(const Lima::Common::Misc::GenericDocumentProperties* properties, bool useIterators) = 0; - virtual void closeSNode() = 0; - virtual void processSContent( const Lima::Common::Misc::GenericDocumentProperties* /*properties*/ ){}; - virtual void closeSContent(){}; + virtual void writeDocumentsHeader(){}; + virtual void writeDocumentsFooter(){}; + + virtual void openSNode(const Lima::Common::Misc::GenericDocumentProperties* properties, + const std::string& elementName) = 0; + virtual void openSIndexingNode(const Lima::Common::Misc::GenericDocumentProperties* properties, + const std::string& elementName) = 0; + virtual void processProperties(const Lima::Common::Misc::GenericDocumentProperties* properties, bool useIterator, bool useIndexIterator) = 0; + virtual void closeSNode() = 0; + virtual void processSContent( const Lima::Common::Misc::GenericDocumentProperties* /*properties*/ ){}; + virtual void closeSContent(){}; }; } // namespace Lima diff --git a/lima_common/src/common/Handler/AbstractProcessingClientHandler.h b/lima_common/src/common/Handler/AbstractProcessingClientHandler.h index bac840b08..2dfaa9ed5 100644 --- a/lima_common/src/common/Handler/AbstractProcessingClientHandler.h +++ b/lima_common/src/common/Handler/AbstractProcessingClientHandler.h @@ -31,7 +31,7 @@ class AbstractProcessingClientHandler public: virtual ~AbstractProcessingClientHandler() {} - inline virtual void setAnalysisClient(const std::string& clientId, AbstractProcessingClient* client) + inline virtual void setAnalysisClient(const std::string& clientId, 
std::shared_ptr< AbstractProcessingClient > client) { if (m_clients.find(clientId)!=m_clients.end()) { @@ -43,7 +43,7 @@ class AbstractProcessingClientHandler m_clients.insert(std::make_pair(clientId, client)); } - inline virtual AbstractProcessingClient* getAnalysisClient(const std::string& clientId) + inline virtual std::shared_ptr< AbstractProcessingClient > getAnalysisClient(const std::string& clientId) { if (m_clients.find(clientId)==m_clients.end()) { @@ -56,19 +56,28 @@ class AbstractProcessingClientHandler return m_clients[clientId]; } - inline virtual std::map getAnalysisClients() const {return m_clients;}; - inline virtual void setAnalysisClients(std::map clients){m_clients=clients;}; + inline virtual std::map > getAnalysisClients() const {return m_clients;}; + inline virtual void setAnalysisClients(std::map > clients){m_clients=clients;}; virtual void handleProc( const std::string& tagName, const std::string& content, const std::map& metaData, const std::string& pipeline, const std::map& handlers = std::map(), - const std::set& inactiveUnits = std::set()) + const std::set& inactiveUnits = std::set() +#ifdef ANTINNO_SPECIFIC + , Lima::StopAnalyze const& stopAnalyze = Lima::defaultStopAnalyze +#endif + ) { ABSTRACTPROCESSINGCLIENTLOGINIT; - LDEBUG << "handleProc("<analyze(content, metaData,pipeline,handlers,inactiveUnits); +#ifdef ANTINNO_SPECIFIC + LDEBUG << "handleProc("<analyze(content, metaData,pipeline,handlers,inactiveUnits, stopAnalyze); +#else + LDEBUG << "handleProc("<analyze(content, metaData,pipeline,handlers,inactiveUnits); +#endif } // inline virtual void setAnalysisHandler(const std::string& handlerId, AbstractAnalysisHandler* handler) @@ -83,9 +92,9 @@ class AbstractProcessingClientHandler private: //! 
@brief list of handlers available - std::map m_clients; + std::map > m_clients; }; } -#endif +#endif \ No newline at end of file diff --git a/lima_common/src/common/Handler/AbstractXmlAnalysisHandler.h b/lima_common/src/common/Handler/AbstractXmlAnalysisHandler.h index d821fcadf..acc147623 100644 --- a/lima_common/src/common/Handler/AbstractXmlAnalysisHandler.h +++ b/lima_common/src/common/Handler/AbstractXmlAnalysisHandler.h @@ -68,4 +68,4 @@ class AbstractXmlAnalysisHandler : public AbstractAnalysisHandler } // Lima -#endif +#endif \ No newline at end of file diff --git a/lima_common/src/common/LimaCommon.cpp b/lima_common/src/common/LimaCommon.cpp index d48df0c89..70cc933f5 100644 --- a/lima_common/src/common/LimaCommon.cpp +++ b/lima_common/src/common/LimaCommon.cpp @@ -1,45 +1,85 @@ -/* - Copyright 2002-2013 CEA LIST - - This file is part of LIMA. - - LIMA is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - LIMA is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with LIMA. If not, see -*/ -#include "common/LimaCommon.h" - -#ifdef WIN32 - -#ifdef LIMA_COMMON_EXPORTING -#define LIMA_COMMON_EXPORT __declspec(dllexport) -#else -#define LIMA_COMMON_EXPORT __declspec(dllimport) -#endif - - -#else // Not WIN32 - -#define LIMA_COMMON_EXPORT - -#endif - - -namespace Lima -{ -namespace Common -{ - -LIMA_COMMON_EXPORT void fakeSymbolFoWindowsLinking() {} - -} -} +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. 
+ + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. If not, see +*/ +#include "common/LimaCommon.h" + +/* FWI 22/02/2016 dplac dans le .h +#ifdef WIN32 + +#ifdef LIMA_COMMON_EXPORTING +#define LIMA_COMMON_EXPORT __declspec(dllexport) +#else +#define LIMA_COMMON_EXPORT __declspec(dllimport) +#endif + + +#else // Not WIN32 + +#define LIMA_COMMON_EXPORT + +#endif +*/ + +#include +#include + +#ifdef ANTINNO_SPECIFIC +namespace Lima +{ +#ifdef _DEBUG +StopAnalyze::StopAnalyze(bool v) : _v(v) +{ +} +StopAnalyze::StopAnalyze(StopAnalyze const& o) : _v(o._v) +{ +} +StopAnalyze::operator bool() const +{ + return _v; +} +StopAnalyze& StopAnalyze::operator=(StopAnalyze const& o) +{ + _v = o._v; + return *this; +} +bool StopAnalyze::operator==(StopAnalyze const& o) +{ + return _v == o._v; +} +bool StopAnalyze::operator!=(StopAnalyze const& o) +{ + return _v != o._v; +} +#else +// nothing +#endif + +StopAnalyze defaultStopAnalyze(false); + +} +#endif + +namespace Lima +{ +namespace Common +{ + +LIMA_COMMON_EXPORT void fakeSymbolFoWindowsLinking() {} + +} +} diff --git a/lima_common/src/common/LimaCommon.h b/lima_common/src/common/LimaCommon.h index a22616c86..b3b0ee0b2 100644 --- a/lima_common/src/common/LimaCommon.h +++ b/lima_common/src/common/LimaCommon.h @@ -1,365 +1,486 @@ -/* - Copyright 2002-2013 CEA LIST - - This file is part of LIMA. 
- - LIMA is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - LIMA is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with LIMA. If not, see -*/ -/************************************************************************ - * - * @file LimaCommon.h (from s2Common.h) - * @author Gael de Chalendar - - * Benoit Mathieu - - * Hervé Le Borgne - - * @date mar déc 18 2007 - * copyright Copyright (C) 2003-2012 by CEA LIST - * Project mm_common - * - * @brief (short description) - * - ***********************************************************************/ -#ifndef LIMA_MMCOMMONS_H -#define LIMA_MMCOMMONS_H - -#include -#include - -#ifdef WIN32 - -#pragma warning( disable : 4512 ) - -// Avoids compilation errors redefining struc sockaddr in ws2def.h -#define _WINSOCKAPI_ - -#undef min -#undef max -typedef __int16 int16_t; -typedef unsigned __int16 uint16_t; -typedef __int32 int32_t; -typedef unsigned __int32 uint32_t; -typedef __int64 int64_t; -typedef unsigned __int64 uint64_t; -#endif - - -#include - -#ifdef WIN32 - - -#ifdef LIMA_DATA_EXPORTING - #define LIMA_DATA_EXPORT __declspec(dllexport) -#else - #define LIMA_DATA_EXPORT __declspec(dllimport) -#endif - -#ifdef LIMA_DATAHANDLER_EXPORTING - #define LIMA_DATAHANDLER_EXPORT __declspec(dllexport) -#else - #define LIMA_DATAHANDLER_EXPORT __declspec(dllimport) -#endif - -#ifdef LIMA_FSAACCESS_EXPORTING - #define LIMA_FSAACCESS_EXPORT __declspec(dllexport) -#else - #define LIMA_FSAACCESS_EXPORT __declspec(dllimport) -#endif - -#ifdef LIMA_MEDIAPROCESSORS_EXPORTING - #define 
LIMA_MEDIAPROCESSORS_EXPORT __declspec(dllexport) -#else - #define LIMA_MEDIAPROCESSORS_EXPORT __declspec(dllimport) -#endif - -#ifdef LIMA_MEDIATICDATA_EXPORTING - #define LIMA_MEDIATICDATA_EXPORT __declspec(dllexport) -#else - #define LIMA_MEDIATICDATA_EXPORT __declspec(dllimport) -#endif - -#ifdef LIMA_COMMONMISC_EXPORTING - #define LIMA_COMMONMISC_EXPORT __declspec(dllexport) -#else - #define LIMA_COMMONMISC_EXPORT __declspec(dllimport) -#endif - -#ifdef LIMA_COMMONTOOLS_EXPORTING - #define LIMA_COMMONTOOLS_EXPORT __declspec(dllexport) -#else - #define LIMA_COMMONTOOLS_EXPORT __declspec(dllimport) -#endif - -#ifdef LIMA_PROCESSUNITFRAMEWORK_EXPORTING - #define LIMA_PROCESSUNITFRAMEWORK_EXPORT __declspec(dllexport) -#else - #define LIMA_PROCESSUNITFRAMEWORK_EXPORT __declspec(dllimport) -#endif - -#ifdef LIMA_TIME_EXPORTING - #define LIMA_TIME_EXPORT __declspec(dllexport) -#else - #define LIMA_TIME_EXPORT __declspec(dllimport) -#endif - -#ifdef LIMA_XMLCONFIGURATIONFILES_EXPORTING - #define LIMA_XMLCONFIGURATIONFILES_EXPORT __declspec(dllexport) -#else - #define LIMA_XMLCONFIGURATIONFILES_EXPORT __declspec(dllimport) -#endif - -#else // Not WIN32 - -#define LIMA_DATA_EXPORT -#define LIMA_DATAHANDLER_EXPORT -#define LIMA_FSAACCESS_EXPORT -#define LIMA_MEDIAPROCESSORS_EXPORT -#define LIMA_MEDIATICDATA_EXPORT -#define LIMA_COMMONMISC_EXPORT -#define LIMA_COMMONTOOLS_EXPORT -#define LIMA_PROCESSUNITFRAMEWORK_EXPORT -#define LIMA_TIME_EXPORT -#define LIMA_XMLCONFIGURATIONFILES_EXPORT - -#endif - -#include -#include - -#ifndef LIMA_DEBUG -#define LIMA_DEBUG 0 -#endif - -// standard include -#include - -#include -#include -#include "common/QsLog/QsLogDest.h" - -#define LTRACE QLOG_TRACE() -#define LDEBUG QLOG_DEBUG() -#define LINFO QLOG_INFO() -#define LNOTICE QLOG_INFO() -#define LWARN QLOG_WARN() -#define LERROR QLOG_ERROR() -#define LFATAL QLOG_FATAL() - -// #define LOGINIT(X) QsLogging::Logger& logger = QsLogging::Logger::instance(X); -// logger.setLoggingLevel( 
QsLogging::Categories::instance().levelFor( X ) ); - -class LogInit +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. If not, see +*/ +/************************************************************************ + * + * @file LimaCommon.h (from s2Common.h) + * @author Gael de Chalendar + + * Benoit Mathieu + + * Hervé Le Borgne + + * @date mar déc 18 2007 + * copyright Copyright (C) 2003-2012 by CEA LIST + * Project mm_common + * + * @brief (short description) + * + ***********************************************************************/ +#ifndef LIMA_MMCOMMONS_H +#define LIMA_MMCOMMONS_H + +#include +#include + +#ifdef WIN32 + +#pragma warning( disable : 4512 ) + +// Avoids compilation errors redefining struc sockaddr in ws2def.h +#define _WINSOCKAPI_ + +#undef min +#undef max +typedef __int16 int16_t; +typedef unsigned __int16 uint16_t; +typedef __int32 int32_t; +typedef unsigned __int32 uint32_t; +typedef __int64 int64_t; +typedef unsigned __int64 uint64_t; +#endif + + +#include + +#ifdef WIN32 + + +#ifdef LIMA_DATA_EXPORTING + #define LIMA_DATA_EXPORT __declspec(dllexport) +#else + #define LIMA_DATA_EXPORT __declspec(dllimport) +#endif + +#ifdef LIMA_DATAHANDLER_EXPORTING + #define LIMA_DATAHANDLER_EXPORT __declspec(dllexport) +#else + #define LIMA_DATAHANDLER_EXPORT __declspec(dllimport) +#endif + +#ifdef LIMA_FSAACCESS_EXPORTING + #define LIMA_FSAACCESS_EXPORT 
__declspec(dllexport) +#else + #define LIMA_FSAACCESS_EXPORT __declspec(dllimport) +#endif + +#ifdef LIMA_MEDIAPROCESSORS_EXPORTING + #define LIMA_MEDIAPROCESSORS_EXPORT __declspec(dllexport) +#else + #define LIMA_MEDIAPROCESSORS_EXPORT __declspec(dllimport) +#endif + +#ifdef LIMA_MEDIATICDATA_EXPORTING + #define LIMA_MEDIATICDATA_EXPORT __declspec(dllexport) +#else + #define LIMA_MEDIATICDATA_EXPORT __declspec(dllimport) +#endif + +#ifdef LIMA_COMMONMISC_EXPORTING + #define LIMA_COMMONMISC_EXPORT __declspec(dllexport) +#else + #define LIMA_COMMONMISC_EXPORT __declspec(dllimport) +#endif + +#ifdef LIMA_COMMONTOOLS_EXPORTING + #define LIMA_COMMONTOOLS_EXPORT __declspec(dllexport) +#else + #define LIMA_COMMONTOOLS_EXPORT __declspec(dllimport) +#endif + +#ifdef LIMA_PROCESSUNITFRAMEWORK_EXPORTING + #define LIMA_PROCESSUNITFRAMEWORK_EXPORT __declspec(dllexport) +#else + #define LIMA_PROCESSUNITFRAMEWORK_EXPORT __declspec(dllimport) +#endif + +#ifdef LIMA_TIME_EXPORTING + #define LIMA_TIME_EXPORT __declspec(dllexport) +#else + #define LIMA_TIME_EXPORT __declspec(dllimport) +#endif + +#ifdef LIMA_XMLCONFIGURATIONFILES_EXPORTING + #define LIMA_XMLCONFIGURATIONFILES_EXPORT __declspec(dllexport) +#else + #define LIMA_XMLCONFIGURATIONFILES_EXPORT __declspec(dllimport) +#endif + +#else // Not WIN32 + +#define LIMA_DATA_EXPORT +#define LIMA_DATAHANDLER_EXPORT +#define LIMA_FSAACCESS_EXPORT +#define LIMA_MEDIAPROCESSORS_EXPORT +#define LIMA_MEDIATICDATA_EXPORT +#define LIMA_COMMONMISC_EXPORT +#define LIMA_COMMONTOOLS_EXPORT +#define LIMA_PROCESSUNITFRAMEWORK_EXPORT +#define LIMA_TIME_EXPORT +#define LIMA_XMLCONFIGURATIONFILES_EXPORT + +#endif + +#include +#include + +#ifndef LIMA_DEBUG +#define LIMA_DEBUG 0 +#endif + +// standard include +#include + +#include +#include +#include "common/QsLog/QsLogDest.h" +#ifdef ANTINNO_SPECIFIC +// FWI 19/05/2016 ajout 2 includes +#include +#include + +#ifdef WIN32 + +#ifdef LIMA_COMMON_EXPORTING +#define LIMA_COMMON_EXPORT 
__declspec(dllexport) +#else +#define LIMA_COMMON_EXPORT __declspec(dllimport) +#endif + + +#else // Not WIN32 + +#define LIMA_COMMON_EXPORT + +#endif +namespace Lima +{ +#ifdef _DEBUG + class LIMA_COMMON_EXPORT StopAnalyze + { + bool _v; + public: + StopAnalyze(bool v); + StopAnalyze(StopAnalyze const&); + operator bool() const; + StopAnalyze& operator=(StopAnalyze const& o); + bool operator==(StopAnalyze const& o); + bool operator!=(StopAnalyze const& o); + }; +#else + typedef bool LIMA_COMMON_EXPORT StopAnalyze; +#endif + extern LIMA_COMMON_EXPORT StopAnalyze defaultStopAnalyze; +} + +#define LTRACE \ + if ( logger.loggingLevel() <= QsLogging::TraceLevel ) \ + QsLogging::antinno::LogHelper(QsLogging::TraceLevel, logger.zone()).stream() +#define LDEBUG \ + if ( logger.loggingLevel() <= QsLogging::DebugLevel ) \ + QsLogging::antinno::LogHelper(QsLogging::DebugLevel, logger.zone()).stream() +#define LINFO \ + if ( logger.loggingLevel() <= QsLogging::InfoLevel ) \ + QsLogging::antinno::LogHelper(QsLogging::InfoLevel, logger.zone()).stream() +#define LNOTICE \ + if ( logger.loggingLevel() <= QsLogging::InfoLevel ) \ + QsLogging::antinno::LogHelper(QsLogging::InfoLevel, logger.zone()).stream() +#define LWARN \ + if ( logger.loggingLevel() <= QsLogging::WarnLevel ) \ + QsLogging::antinno::LogHelper(QsLogging::WarnLevel, logger.zone()).stream() +#define LERROR \ + if ( logger.loggingLevel() <= QsLogging::ErrorLevel ) \ + QsLogging::antinno::LogHelper(QsLogging::ErrorLevel, logger.zone()).stream() +#define LFATAL \ + if ( logger.loggingLevel() <= QsLogging::FatalLevel ) \ + QsLogging::antinno::LogHelper(QsLogging::FatalLevel, logger.zone()).stream() + +#else + +#define LTRACE QLOG_TRACE() +#define LDEBUG QLOG_DEBUG() +#define LINFO QLOG_INFO() +#define LNOTICE QLOG_INFO() +#define LWARN QLOG_WARN() +#define LERROR QLOG_ERROR() +#define LFATAL QLOG_FATAL() + +#endif + +// #define LOGINIT(X) QsLogging::Logger& logger = QsLogging::Logger::instance(X); +// 
logger.setLoggingLevel( QsLogging::Categories::instance().levelFor( X ) ); + +class LogInit +{ +public: + LogInit(char const* x) + { + // initialisation thread-safe + static QMutex mutex; + QMutexLocker locker(&mutex); + pLogger = &QsLogging::Logger::instance(x); +#ifndef DEBUG_CD + QsLogging::Level level = QsLogging::Categories::instance().levelFor(x); + pLogger->setLoggingLevel(level); +#endif + + } + QsLogging::Logger* pLogger; +}; +#ifndef DEBUG_CD +#define LOGINIT(X) \ + static LogInit logInit(X); /*initialisation exécutée une seul fois*/\ + auto& logger = *(logInit.pLogger); +#else +#define LOGINIT(X) \ + static LogInit logInit(X); /*initialisation exécutée une seul fois*/\ + auto& logger = *(logInit.pLogger); \ + logger.setLoggingLevel(QsLogging::Categories::instance().levelFor( X )); +#endif + +//QsLogging::DestinationPtr debugDestination( QsLogging::DestinationFactory::MakeDebugOutputDestination() ); +//logger.addDestination(debugDestination.get()); +#ifdef ANTINNO_SPECIFIC +// FWI 07/10/2015 ajout pour les logger +static std::ostream& operator<<(std::ostream &os, const QString& s) { -public: - LogInit(char const* x) - { - // initialisation thread-safe - static QMutex mutex; - QMutexLocker locker(&mutex); - pLogger = &QsLogging::Logger::instance(x); - QsLogging::Level level = QsLogging::Categories::instance().levelFor(x); - pLogger->setLoggingLevel(level); - } - QsLogging::Logger* pLogger; -}; -#ifndef DEBUG_CD -#define LOGINIT(X) \ - static LogInit logInit(X); /*initialisation exécutée une seul fois*/\ - auto& logger = *(logInit.pLogger); -#else -#define LOGINIT(X) \ - static LogInit logInit(X); /*initialisation exécutée une seul fois*/\ - auto& logger = *(logInit.pLogger); \ - logger.setLoggingLevel(QsLogging::Categories::instance().levelFor( X )); -#endif - -//QsLogging::DestinationPtr debugDestination( QsLogging::DestinationFactory::MakeDebugOutputDestination() ); -//logger.addDestination(debugDestination.get()); - - -#define LENDL ". 
Note: LENDL is deprecated. It will be removed from a future release." - -#define ABSTRACTFACTORYPATTERNLOGINIT LOGINIT("Common::AbstractFactoryPattern") -#define ABSTRACTPROCESSINGCLIENTLOGINIT LOGINIT("Common::AbstractProcessingClient") -#define AGLOGINIT LOGINIT("Common::AnnotationGraph") -#define BOWLOGINIT LOGINIT("Common::BOW"); -#define CLIENTFACTORYLOGINIT LOGINIT("Common::ClientFactory") -#define COMPSTRACCESSLOGINIT LOGINIT("Common::CompStrAccess") -#define FSAAHASHLOGINIT LOGINIT("Common::FsaAccessHash") -#define FSAAIOLOGINIT LOGINIT("Common::FsaAccessIO") -#define FSAALOGINIT LOGINIT("Common::FsaAccess") -#define HANDLERLOGINIT LOGINIT("Common::Handler") -#define LDATALOGINIT LOGINIT("Common::LanguageData") -#define MDATALOGINIT LOGINIT("Common::MediaticData") -#define MISCLOGINIT LOGINIT("Common::Misc") -#define PROCESSORSLOGINIT LOGINIT("Common::Processors") -#define PROCESSUNITFRAMEWORKLOGINIT LOGINIT("Common::ProcessUnitFramework") -#define PROPERTYCODELOGINIT LOGINIT("Common::PropertyCode") -#define STRINGMAPLOGINIT LOGINIT("Common::StringMap") -#define STRPOOLLOGINIT LOGINIT("Common::StringPool") -#define TGVLOGINIT LOGINIT("Common::TGV") -#define XMLCFGLOGINIT LOGINIT("Common::XMLConfigurationFiles") -#define DYNAMICLIBMANAGERLOGINIT LOGINIT("Common::DynamicLibrariesManager") - -QDebug& operator<< (QDebug& qd, const std::string& str ); - - -#ifndef LIMA_UNUSED -#define LIMA_UNUSED(x) (void)x; -#endif - -namespace Lima -{ - -enum LimaStatusCode { - SUCCESS_ID, - CANNOT_OPEN_FILE_ERROR, - OUT_OF_RANGE_ERROR, - UNKNOWN_ERROR, - UNSUPPORTED_LANGUAGE, - INVALID_CONFIGURATION, - MISSING_DATA -}; - -BOOST_STRONG_TYPEDEF(uint32_t, LinguisticCode); -BOOST_STRONG_TYPEDEF(char, NoParameters); - -#define UNDEFLANG std::numeric_limits::max() - -BOOST_STRONG_TYPEDEF(uint8_t, MediaId); - -class LimaException : public std::exception -{ -public: - LimaException() : std::exception(),m_reason() {} - LimaException(const std::string& mess) : std::exception(), 
m_reason(mess) {} - virtual ~LimaException() throw() {} - virtual const char * what () const throw() { - return m_reason.c_str(); - } -protected: - LimaException& operator=(const LimaException&) {return *this;} - const std::string m_reason; -}; - -class InvalidConfiguration : public LimaException -{ - public: - InvalidConfiguration() : LimaException() {}; - InvalidConfiguration(const std::string& mess) : LimaException(mess) {} -private: - InvalidConfiguration& operator=(const InvalidConfiguration&) {return *this;} -}; -class MediaNotInitialized : public LimaException -{ -public : - MediaNotInitialized(MediaId medId) : LimaException(),m_medId(medId),m_med(),m_num(true) {}; - MediaNotInitialized(const std::string& med) : LimaException(),m_medId(0),m_med(med),m_num(false) {}; - virtual ~MediaNotInitialized() throw() {}; - const char* what() const throw() - { - if (m_num) - { - std::ostringstream oo; - oo << "uninitialized media " << (int)m_medId; - return oo.str().c_str(); - } - else - { - return (std::string("uninitialized media ")+m_med).c_str(); - } - }; -private: - MediaNotInitialized& operator=(const MediaNotInitialized&) {return *this;} - MediaId m_medId; - std::string m_med; - bool m_num; -}; - -class LanguageNotInitialized : public LimaException { -public : - LanguageNotInitialized(MediaId langId) : LimaException(),m_langId(langId),m_lang(),m_num(true) {}; - LanguageNotInitialized(const std::string& lang) : LimaException(),m_langId(0),m_lang(lang),m_num(false) {}; - virtual ~LanguageNotInitialized() throw() {}; - const char* what() const throw() { - if (m_num) { - std::ostringstream oo; - oo << "uninitialized language " << (int)m_langId; - return oo.str().c_str(); - } else { - return (std::string("uninitialized language ")+m_lang).c_str(); - } - }; -private: - LanguageNotInitialized& operator=(const LanguageNotInitialized&) {return *this;} - MediaId m_langId; - std::string m_lang; - bool m_num; -}; - -class AccessByStringNotInitialized : public LimaException { 
-public : - AccessByStringNotInitialized(const std::string& reason) : LimaException(), m_reason(reason) {}; - virtual ~AccessByStringNotInitialized() throw() {}; - const char* what() const throw() { - std::ostringstream oo; - oo << "Fsa not initialized because of " << m_reason; - return oo.str().c_str(); - }; -private: - AccessByStringNotInitialized& operator=(const AccessByStringNotInitialized&) {return *this;} - std::string m_reason; -}; - -class AccessByStringOutOfRange : public LimaException { -public : - AccessByStringOutOfRange(const std::string& reason) : LimaException(), m_reason(reason) {}; - virtual ~AccessByStringOutOfRange() throw() {}; - const char* what() const throw() { - std::ostringstream oo; - oo << "parameter out of range " << m_reason; - return oo.str().c_str(); - }; -private: - AccessByStringOutOfRange& operator=(const AccessByStringOutOfRange&) {return *this;} - std::string m_reason; -}; - -class IncompleteResources : public LimaException { -public : - IncompleteResources(const std::string& reason) : LimaException(), m_reason(reason) {} - virtual ~IncompleteResources() throw() {} - const char* what() const throw() { - return (std::string("incomplete ressources: ") + m_reason).c_str() ; - } -private: - IncompleteResources& operator=(const IncompleteResources&) {return *this;} - std::string m_reason; -}; - -class XMLException : public std::runtime_error -{ -public: - explicit XMLException(const std::string& msg = "") : std::runtime_error(msg) {} - const char* getMessage() const {return this->what();} -private: - XMLException& operator=(const XMLException&) {return *this;} -}; - - -} // closing namespace Lima - -#endif // LIMA_MMCOMMONS_H + os << s.toUtf8().constData(); + return os; +} + +static ::std::ostream& operator<<(::std::ostream& out, QStringList const& o) +{ + bool isFirst = true; + for(auto it=o.constBegin(); it!=o.constEnd(); ++it) + { + out << (isFirst?L"":L",") << *it; + isFirst = false; + } + return out; +} +#endif + + + +#define 
LENDL ". Note: LENDL is deprecated. It will be removed from a future release." + +#define ABSTRACTFACTORYPATTERNLOGINIT LOGINIT("Common::AbstractFactoryPattern") +#define ABSTRACTPROCESSINGCLIENTLOGINIT LOGINIT("Common::AbstractProcessingClient") +#define AGLOGINIT LOGINIT("Common::AnnotationGraph") +#define BOWLOGINIT LOGINIT("Common::BOW"); +#define CLIENTFACTORYLOGINIT LOGINIT("Common::ClientFactory") +#define COMPSTRACCESSLOGINIT LOGINIT("Common::CompStrAccess") +#define FSAAHASHLOGINIT LOGINIT("Common::FsaAccessHash") +#define FSAAIOLOGINIT LOGINIT("Common::FsaAccessIO") +#define FSAALOGINIT LOGINIT("Common::FsaAccess") +#define HANDLERLOGINIT LOGINIT("Common::Handler") +#define LDATALOGINIT LOGINIT("Common::LanguageData") +#define MDATALOGINIT LOGINIT("Common::MediaticData") +#define MISCLOGINIT LOGINIT("Common::Misc") +#define PROCESSORSLOGINIT LOGINIT("Common::Processors") +#define PROCESSUNITFRAMEWORKLOGINIT LOGINIT("Common::ProcessUnitFramework") +#define PROPERTYCODELOGINIT LOGINIT("Common::PropertyCode") +#define STRINGMAPLOGINIT LOGINIT("Common::StringMap") +#define STRPOOLLOGINIT LOGINIT("Common::StringPool") +#define TGVLOGINIT LOGINIT("Common::TGV") +#define XMLCFGLOGINIT LOGINIT("Common::XMLConfigurationFiles") +#define DYNAMICLIBMANAGERLOGINIT LOGINIT("Common::DynamicLibrariesManager") + +QDebug& operator<< (QDebug& qd, const std::string& str ); + + +#ifndef LIMA_UNUSED +#define LIMA_UNUSED(x) (void)x; +#endif + +namespace Lima +{ + +enum LimaStatusCode { + SUCCESS_ID, + CANNOT_OPEN_FILE_ERROR, + OUT_OF_RANGE_ERROR, + UNKNOWN_ERROR, + UNSUPPORTED_LANGUAGE, + INVALID_CONFIGURATION, + MISSING_DATA +#ifdef ANTINNO_SPECIFIC + // FWI 22/02/2016 ajout TIME_OVERFLOW pour stopAnalyze + ,TIME_OVERFLOW +#endif +}; + +#ifdef ANTINNO_SPECIFIC +BOOST_STRONG_TYPEDEF(unsigned int, ReformulationType) +#endif + + +BOOST_STRONG_TYPEDEF(uint32_t, LinguisticCode); +#ifdef ANTINNO_SPECIFIC +// FWI 25/05/2016 : on spécialise max() pour le type LinguisticCode sinon le 
max() d'origine renvoit 0 (ce qui est un bug de la lib std) +} +namespace std { + template <> Lima::LinguisticCode numeric_limits::max() { return Lima::LinguisticCode(::std::numeric_limits::max()); } +} +namespace Lima { +#endif + +BOOST_STRONG_TYPEDEF(char, NoParameters); +#ifdef ANTINNO_SPECIFIC +// FWI 25/05/2016 : on spécialise max() pour le type LinguisticCode sinon le max() d'origine renvoit 0 (ce qui est un bug de la lib std) +} +namespace std { + template <> Lima::NoParameters numeric_limits::max() { return Lima::NoParameters(::std::numeric_limits::max()); } +} +namespace Lima { +#endif + +#define UNDEFLANG std::numeric_limits::max() + +BOOST_STRONG_TYPEDEF(uint8_t, MediaId); +#ifdef ANTINNO_SPECIFIC +// FWI 25/05/2016 : on spécialise max() pour le type MediaId sinon le max() d'origine renvoit 0 (ce qui est un bug de la lib std) +} +namespace std { + template <> Lima::MediaId numeric_limits::max() { return Lima::MediaId(::std::numeric_limits::max()); } +} +namespace Lima { +#endif + +class LimaException : public std::exception +{ +public: + LimaException() : std::exception(),m_reason() {} + LimaException(const std::string& mess) : std::exception(), m_reason(mess) {} + virtual ~LimaException() throw() {} + virtual const char * what () const throw() { + return m_reason.c_str(); + } +protected: + LimaException& operator=(const LimaException&) {return *this;} + const std::string m_reason; +}; + +class InvalidConfiguration : public LimaException +{ + public: + InvalidConfiguration() : LimaException() {}; + InvalidConfiguration(const std::string& mess) : LimaException(mess) {} +private: + InvalidConfiguration& operator=(const InvalidConfiguration&) {return *this;} +}; +class MediaNotInitialized : public LimaException +{ +public : + MediaNotInitialized(MediaId medId) : LimaException(),m_medId(medId),m_med(),m_num(true) {}; + MediaNotInitialized(const std::string& med) : LimaException(),m_medId(0),m_med(med),m_num(false) {}; + virtual ~MediaNotInitialized() throw() 
{}; + const char* what() const throw() + { + if (m_num) + { + std::ostringstream oo; + oo << "uninitialized media " << (int)m_medId; + return oo.str().c_str(); + } + else + { + return (std::string("uninitialized media ")+m_med).c_str(); + } + }; +private: + MediaNotInitialized& operator=(const MediaNotInitialized&) {return *this;} + MediaId m_medId; + std::string m_med; + bool m_num; +}; + +class LanguageNotInitialized : public LimaException { +public : + LanguageNotInitialized(MediaId langId) : LimaException(),m_langId(langId),m_lang(),m_num(true) {}; + LanguageNotInitialized(const std::string& lang) : LimaException(),m_langId(0),m_lang(lang),m_num(false) {}; + virtual ~LanguageNotInitialized() throw() {}; + const char* what() const throw() { + if (m_num) { + std::ostringstream oo; + oo << "uninitialized language " << (int)m_langId; + return oo.str().c_str(); + } else { + return (std::string("uninitialized language ")+m_lang).c_str(); + } + }; +private: + LanguageNotInitialized& operator=(const LanguageNotInitialized&) {return *this;} + MediaId m_langId; + std::string m_lang; + bool m_num; +}; + +class AccessByStringNotInitialized : public LimaException { +public : + AccessByStringNotInitialized(const std::string& reason) : LimaException(), m_reason(reason) {}; + virtual ~AccessByStringNotInitialized() throw() {}; + const char* what() const throw() { + std::ostringstream oo; + oo << "Fsa not initialized because of " << m_reason; + return oo.str().c_str(); + }; +private: + AccessByStringNotInitialized& operator=(const AccessByStringNotInitialized&) {return *this;} + std::string m_reason; +}; + +class AccessByStringOutOfRange : public LimaException { +public : + AccessByStringOutOfRange(const std::string& reason) : LimaException(), m_reason(reason) {}; + virtual ~AccessByStringOutOfRange() throw() {}; + const char* what() const throw() { + std::ostringstream oo; + oo << "parameter out of range " << m_reason; + return oo.str().c_str(); + }; +private: + 
AccessByStringOutOfRange& operator=(const AccessByStringOutOfRange&) {return *this;} + std::string m_reason; +}; + +class IncompleteResources : public LimaException { +public : + IncompleteResources(const std::string& reason) : LimaException(), m_reason(reason) {} + virtual ~IncompleteResources() throw() {} + const char* what() const throw() { + return (std::string("incomplete ressources: ") + m_reason).c_str() ; + } +private: + IncompleteResources& operator=(const IncompleteResources&) {return *this;} + std::string m_reason; +}; + +class XMLException : public std::runtime_error +{ +public: + explicit XMLException(const std::string& msg = "") : std::runtime_error(msg) {} + const char* getMessage() const {return this->what();} +private: + XMLException& operator=(const XMLException&) {return *this;} +}; + + +} // closing namespace Lima + +#endif // LIMA_MMCOMMONS_H diff --git a/lima_common/src/common/MediaProcessors/MediaProcessors.cpp b/lima_common/src/common/MediaProcessors/MediaProcessors.cpp index e5fcfd051..11ddb9c34 100644 --- a/lima_common/src/common/MediaProcessors/MediaProcessors.cpp +++ b/lima_common/src/common/MediaProcessors/MediaProcessors.cpp @@ -29,6 +29,7 @@ #include "common/XMLConfigurationFiles/moduleConfigurationStructure.h" #include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" #include "common/MediaticData/mediaticData.h" +#include "common/tools/FileUtils.h" #include @@ -70,18 +71,10 @@ MediaProcessors::MediaProcessors(const MediaProcessors& mp) : Singleton::iterator it=m_d->m_pipelineManagers.begin(); it!=m_d->m_pipelineManagers.end(); it++ ) { -#ifdef DEBUG_CD - LDEBUG << "delete " << it->first; -#endif delete it->second; it->second=0; } @@ -180,6 +173,10 @@ void MediaProcessors::initPipelines ( { std::cout << "no pipeline '" << *pipItr << "' for media " << mediaStr << std::endl; // continue; +#ifdef ANTINNO_BUGFIX + // FWI 26/04/2016 : activation du "continue" sinon entryItr->second provoque une erreur dtecte seulement en 
mode debug + continue; +#endif } const MediaProcessUnit* pu=mapItr->second->getObject ( entryItr->second ); const MediaProcessUnitPipeline* pipeline=static_cast ( pu ); @@ -252,8 +249,7 @@ includeProcessors(Common::XMLConfigurationFiles::ModuleConfigurationStructure& m try { //PROCESSORSLOGINIT; //LDEBUG << "i="<< i; - fileName=Common::MediaticData::MediaticData::single().getConfigPath()+ - "/"+string((*it),0,i); + fileName=Common::Misc::findFileInPaths(Common::MediaticData::MediaticData::single().getConfigPath().c_str(),string((*it),0,i).c_str()).toUtf8().constData(); //LDEBUG << "filename="<< fileName; moduleName=string((*it),i+1); //LDEBUG << "moduleName="<< moduleName; diff --git a/lima_common/src/common/MediaticData/EntityType.h b/lima_common/src/common/MediaticData/EntityType.h index 8e678be90..94e462ebb 100644 --- a/lima_common/src/common/MediaticData/EntityType.h +++ b/lima_common/src/common/MediaticData/EntityType.h @@ -36,7 +36,7 @@ #include -#include +#include namespace Lima { namespace Common { diff --git a/lima_common/src/common/MediaticData/mediaData.cpp b/lima_common/src/common/MediaticData/mediaData.cpp index 7880859a2..7dad1a72a 100644 --- a/lima_common/src/common/MediaticData/mediaData.cpp +++ b/lima_common/src/common/MediaticData/mediaData.cpp @@ -53,10 +53,10 @@ class MediaDataPrivate //std::list< LinguisticCode > m_sentenceBreakMicros; }; -MediaData::MediaData() : m_d(new MediaDataPrivate()) +MediaData::MediaData() : InitializableObject(), m_d(new MediaDataPrivate()) {} -MediaData::MediaData(const MediaData& md) : m_d(new MediaDataPrivate(*md.m_d)) +MediaData::MediaData(const MediaData& md) : InitializableObject(md), m_d(new MediaDataPrivate(*md.m_d)) {} MediaData::~MediaData() diff --git a/lima_common/src/common/MediaticData/mediaData.h b/lima_common/src/common/MediaticData/mediaData.h index d7dfa2c3c..f796f7933 100644 --- a/lima_common/src/common/MediaticData/mediaData.h +++ b/lima_common/src/common/MediaticData/mediaData.h @@ -31,7 +31,7 @@ 
#include //uint32_t #endif -#include +#include #include namespace Lima @@ -47,6 +47,15 @@ namespace MediaticData #define MEDIADATA_CLASSID "MediaData" BOOST_STRONG_TYPEDEF(boost::uint32_t, ConceptType); +#ifdef ANTINNO_SPECIFIC +// FWI 25/05/2016 : on spcialise max() pour le type ConceptType sinon le max() d'origine renvoit 0 (ce qui est un bug de la lib std) +}}} +namespace std { + template <> Lima::Common::MediaticData::ConceptType numeric_limits::max() + { return Lima::Common::MediaticData::ConceptType(::std::numeric_limits::max()); } +} +namespace Lima { namespace Common { namespace MediaticData{ +#endif class MediaDataPrivate; /** diff --git a/lima_common/src/common/MediaticData/mediaticData.cpp b/lima_common/src/common/MediaticData/mediaticData.cpp index 614fbe11c..235dc7ce4 100644 --- a/lima_common/src/common/MediaticData/mediaticData.cpp +++ b/lima_common/src/common/MediaticData/mediaticData.cpp @@ -32,6 +32,7 @@ #include "common/LimaCommon.h" #include "common/QsLog/QsLog.h" #include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" +#include "common/tools/FileUtils.h" #include "common/Data/readwritetools.h" #include "common/misc/DoubleAccessObjectToIdMap.h" //#include "common/misc/strwstrtools.h" @@ -51,6 +52,7 @@ #include #include #include +#include using namespace std; @@ -98,7 +100,7 @@ class MediaticDataPrivate std::map< std::string, MediaId > m_mediasIds; std::map< MediaId, std::string > m_mediasSymbol; - std::map< MediaId, std::string > m_mediaDefinitionFiles; + std::map< MediaId, QString > m_mediaDefinitionFiles; std::map< MediaId, MediaData* > m_mediasData; // entity types @@ -203,7 +205,7 @@ void MediaticData::init( // TimeUtils::updateCurrentTime(); MDATALOGINIT; - LINFO << "MediaticData::init " << resourcesPath.c_str() << " " << configPath.c_str() << " " << configFile.c_str(); + LINFO << "MediaticData::init " << resourcesPath << " " << configPath << " " << configFile; //LINFO << "Mediatic data initialization"; 
m_d->m_resourcesPath=resourcesPath; @@ -211,38 +213,55 @@ void MediaticData::init( m_d->m_configFile=configFile; //LINFO << "initialize XMLParser"; - initXMLParser(); - //LINFO << "parse configuration file: " << configPath << "/" << configFile; - Common::XMLConfigurationFiles::XMLConfigurationFileParser configuration(configPath + "/" + configFile); - - LINFO << "MediaticData::init for "; - for (std::deque< std::string >::const_iterator it = meds.begin(); it != meds.end(); it++) - LINFO << " " << (*it).c_str(); + QStringList configPaths = QString::fromUtf8(configPath.c_str()).split(LIMA_PATH_SEPARATOR); + QStringList configFiles = QString::fromUtf8(configFile.c_str()).split(LIMA_PATH_SEPARATOR); + bool configurationFileFound = false; + Q_FOREACH(QString confPath, configPaths) + { + Q_FOREACH(QString confFile, configFiles) + { + if (QFileInfo(confPath + "/" + confFile).exists()) + { + LDEBUG << "MediaticData::init parse configuration file: " << (confPath + "/" + confFile); + configurationFileFound = true; + Common::XMLConfigurationFiles::XMLConfigurationFileParser configuration((confPath + "/" + confFile).toUtf8().constData()); - // initHomoSyntagmaticChainsAndRelationsTypes(*configParser); - LDEBUG << "initialize global parameters"; - m_d->initReleaseStringsPool(configuration); + // initHomoSyntagmaticChainsAndRelationsTypes(*configParser); + LDEBUG << "MediaticData::init initialize global parameters"; + m_d->initReleaseStringsPool(configuration); - initEntityTypes(configuration); + initEntityTypes(configuration); - m_d->initRelations(configuration); - - m_d->initConceptTypes(configuration); - - /** - * initialize active medias - */ - - m_d->initMedias(configuration, meds); - - m_d->m_mediasData.clear(); - for (map::const_iterator it=m_d->m_mediasIds.begin(); - it!=m_d->m_mediasIds.end(); - it++) + m_d->initRelations(configuration); + + m_d->initConceptTypes(configuration); + + /** + * initialize active medias + */ + LINFO << "!!! 
MediaticData::init for "; + for (std::deque< std::string >::const_iterator it = meds.begin(); it != meds.end(); it++) + LINFO << " " << (*it).c_str(); + + m_d->initMedias(configuration, meds); + + m_d->m_mediasData.clear(); + for (map::const_iterator it=m_d->m_mediasIds.begin(); + it!=m_d->m_mediasIds.end(); + it++) + { + initMediaData(it->second); + } + } + if (configurationFileFound) break; + } + if (configurationFileFound) break; + } + if (!configurationFileFound) { - initMediaData(it->second); + MDATALOGINIT; + LERROR << "No configuration file has been found with" << configPath << "and" << configFile; } - //LINFO << "Mediatic data initialization finished"; // TimeUtils::logElapsedTime("MediaticDataInit"); } @@ -265,7 +284,7 @@ void MediaticData::initMedia(const std::string& media) LINFO << "MediaticData::initMedia" << media; //LINFO << "parse configuration file: " << configPath << "/" << configFile; - Common::XMLConfigurationFiles::XMLConfigurationFileParser configuration(m_d->m_configPath + "/" + m_d->m_configFile); + Common::XMLConfigurationFiles::XMLConfigurationFileParser configuration(Common::Misc::findFileInPaths(m_d->m_configPath.c_str(), m_d->m_configFile.c_str()).toUtf8().constData()); Lima::Common::MediaticData::MediaticData::changeable().initEntityTypes(configuration); std::deque< std::string > meds; @@ -349,13 +368,6 @@ MediaData& MediaticData::mediaData(MediaId media) return *(it->second); } -void MediaticData::initXMLParser() -{ -// MDATALOGINIT; - //LINFO << "XMLParser initialization"; - -} - void MediaticDataPrivate::initMedias( XMLConfigurationFileParser& configParser, const std::deque< std::string >& meds) @@ -402,9 +414,32 @@ void MediaticDataPrivate::initMedias( m_mediasIds[*it]=id; m_mediasSymbol[id]=*it; - string deffile=configParser.getModuleGroupParamValue("common","mediaDefinitionFiles",*it); - m_mediaDefinitionFiles[id]= m_configPath+"/"+deffile; - + QString deffile= 
QString::fromUtf8(configParser.getModuleGroupParamValue("common","mediaDefinitionFiles",*it).c_str()); + QStringList configPaths = QString::fromUtf8(m_configPath.c_str()).split(LIMA_PATH_SEPARATOR); + bool mediaDefinitionFileFound = false; +#ifdef ANTINNO_SPECIFIC + Q_FOREACH(const QString& confPath, configPaths) +#else + for(const QString& confPath: configPaths) +#endif + { + if (QFileInfo(confPath + "/" + deffile).exists()) + { + m_mediaDefinitionFiles[id] = (confPath+"/"+deffile); +#ifdef DEBUG_CD + LDEBUG << "media definition file for id" << id << "is" << m_mediaDefinitionFiles[id]; +#endif + mediaDefinitionFileFound = true; + break; + } + } + if (!mediaDefinitionFileFound) + { + MDATALOGINIT; + LERROR << "No media definition file'"<::const_iterator it=m_d->m_mediaDefinitionFiles.find(med); + auto it=m_d->m_mediaDefinitionFiles.find(med); if (it==m_d->m_mediaDefinitionFiles.end()) { MDATALOGINIT; @@ -431,9 +466,9 @@ void MediaticData::initMediaData(MediaId med) throw InvalidConfiguration(); } #ifdef DEBUG_CD - LDEBUG << "MediaticData::initMediaData Parse MediaConfigurationFile " << (it->second).c_str(); + LDEBUG << "MediaticData::initMediaData Parse MediaConfigurationFile " << (it->second); #endif - XMLConfigurationFileParser parser(it->second); + XMLConfigurationFileParser parser((it->second).toUtf8().constData()); #ifdef DEBUG_CD LDEBUG << "MediaticData::initMediaData Class: " << parser.getModuleGroupParamValue("MediaData","Class","class").c_str(); @@ -486,8 +521,8 @@ void MediaticDataPrivate::initRelations( { #ifdef DEBUG_CD MDATALOGINIT; + LDEBUG << "MediaticDataPrivate::initRelations"; #endif - //LINFO << "intialize Relations"; m_relTypes[s_undefinedRelation]=0; m_relTypesNum[0]=s_undefinedRelation; @@ -496,14 +531,14 @@ void MediaticDataPrivate::initRelations( for (map::const_iterator it=rels.begin(); it!=rels.end(); it++) - { - uint8_t relId=atoi(it->second.c_str()); + { + uint8_t relId=atoi(it->second.c_str()); #ifdef DEBUG_CD - LDEBUG << "read relation 
" << it->first.c_str() << " -> " << (int)relId; + LDEBUG << "read relation " << it->first.c_str() << " -> " << (int)relId; #endif - m_relTypes[it->first]=relId; - m_relTypesNum[relId]=it->first; - } + m_relTypes[it->first]=relId; + m_relTypesNum[relId]=it->first; + } } catch (NoSuchGroup& ) { MDATALOGINIT; @@ -521,8 +556,8 @@ void MediaticDataPrivate::initConceptTypes( { #ifdef DEBUG_CD MDATALOGINIT; + LDEBUG << "MediaticDataPrivate::initConceptTypes"; #endif - //LINFO << "intialize Concepts Types"; try { const map& mapping=configParser.getModuleConfiguration("common").getGroupNamed("SemanticData").getMapAtKey("conceptTypes"); @@ -653,10 +688,10 @@ void MediaticData::initEntityTypes(XMLConfigurationFileParser& configParser) LimaString groupName=Common::Misc::utf8stdstring2limastring((*it).first); - if (groupName=="include") { + if (groupName=="include") + { deque includeList=moduleConf.getListValuesAtKeyOfGroupNamed("includeList","include"); string::size_type i; - string fileName(""); string moduleName(""); for (std::size_t k=0; km_configPath.c_str()).split(LIMA_PATH_SEPARATOR); + Q_FOREACH(QString confPath, configPaths) + { + if (QFileInfo(confPath + "/" + string(includeList[k],0,i).c_str()).exists()) + { + + std::string fileName= (confPath + "/" + string(includeList[k],0,i).c_str()).toUtf8().constData(); - Lima::Common::XMLConfigurationFiles::XMLConfigurationFileParser lpconfig2(fileName); - Common::MediaticData::MediaticData::changeable().initEntityTypes(lpconfig2); + Lima::Common::XMLConfigurationFiles::XMLConfigurationFileParser lpconfig2(fileName); + Common::MediaticData::MediaticData::changeable().initEntityTypes(lpconfig2); + break; + } + } } - } else { + } + else + { EntityGroupId groupId=addEntityGroup(groupName); #ifdef DEBUG_CD LDEBUG << "initEntityTypes: id is " << groupId; @@ -1019,6 +1064,18 @@ MediaticDataPrivate::~MediaticDataPrivate() { delete it->second; } +#ifdef ANTINNO_SPECIFIC + Q_FOREACH(auto entityType, m_entityTypes) +#else + for (auto 
entityType: m_entityTypes) +#endif + { + delete entityType; + } + for (auto it = m_stringsPool.begin(); it != m_stringsPool.end(); it++) + { + delete it->second; + } } const LimaString& MediaticData::getEntityTypeNameSeparator() const diff --git a/lima_common/src/common/MediaticData/mediaticData.h b/lima_common/src/common/MediaticData/mediaticData.h index fb755b50b..33f74d47b 100644 --- a/lima_common/src/common/MediaticData/mediaticData.h +++ b/lima_common/src/common/MediaticData/mediaticData.h @@ -86,8 +86,6 @@ class LIMA_MEDIATICDATA_EXPORT MediaticData : public Singleton void initMediaData(MediaId med); - void initXMLParser(); - const FsaStringsPool& stringsPool(MediaId med) const; FsaStringsPool& stringsPool(MediaId med); diff --git a/lima_common/src/common/MediaticData/tests/MediaticDataTest.cpp b/lima_common/src/common/MediaticData/tests/MediaticDataTest.cpp new file mode 100644 index 000000000..7abc26134 --- /dev/null +++ b/lima_common/src/common/MediaticData/tests/MediaticDataTest.cpp @@ -0,0 +1,112 @@ +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. 
If not, see +*/ +#define BOOST_TEST_DYN_LINK +#define BOOST_TEST_MODULE MediaticData +#include + +#include "common/MediaticData/mediaticData.h" + +#include "common/time/traceUtils.h" +#include "common/QsLog/QsLog.h" +#include "common/QsLog/QsLogDest.h" +#include "common/QsLog/QsLogCategories.h" +#include "common/QsLog/QsDebugOutput.h" +#include "common/AbstractFactoryPattern/AmosePluginsManager.h" + +using namespace Lima; + +// conversion functions +BOOST_AUTO_TEST_CASE( MediaticDataTest ) +{ + QsLogging::initQsLog(); + Lima::AmosePluginsManager::single(); + + std::string resourcesPath; + std::string configDir; + std::string commonConfigFile("lima-common.xml"); + std::deque langs; + langs.push_front("fre"); + + resourcesPath = std::string (qgetenv("LIMA_RESOURCES").constData()==0?"":qgetenv("LIMA_RESOURCES").constData()); + if (resourcesPath.empty()) + { + resourcesPath = "/usr/share/apps/lima/resources/"; + } + std::cerr << "MediaticData0: resourcesPath=" << resourcesPath << std::endl; + + configDir = std::string (qgetenv("LIMA_CONF").constData()==0?"":qgetenv("LIMA_CONF").constData()); + if (configDir.empty()) + { + configDir = "/usr/share/config/lima"; + } + std::cerr << "MediaticData0: configDir=" << configDir << std::endl; + + // initialize common + Common::MediaticData::MediaticData::changeable().init( + resourcesPath, + configDir, + commonConfigFile, + langs); + + // use setter: Create LinguisticProcessing.IDIOM and LinguisticProcessing.SYNTACTIC_RELATION + LimaString groupName1("LinguisticProcessing"); + Common::MediaticData::EntityGroupId group1 = Common::MediaticData::MediaticData::changeable().addEntityGroup(groupName1); + LimaString entityName11("IDIOM"); + LimaString entityName12("SYNTACTIC_RELATION"); + Common::MediaticData::EntityType type11 = Common::MediaticData::MediaticData::changeable().addEntity(groupName1,entityName11); + Common::MediaticData::EntityType type12 = 
Common::MediaticData::MediaticData::changeable().addEntity(groupName1,entityName12); + + // use setter: Create Location.CITYand Location.COUNTRY + LimaString groupName2("Location"); + Common::MediaticData::EntityGroupId group2 = Common::MediaticData::MediaticData::changeable().addEntityGroup(groupName2); + LimaString entityName21("CITY"); + LimaString entityName22("COUNTRY"); + Common::MediaticData::EntityType type21 = Common::MediaticData::MediaticData::changeable().addEntity(groupName2,entityName21); + Common::MediaticData::EntityType type22 = Common::MediaticData::MediaticData::changeable().addEntity(groupName2,entityName22); + + // test getter: get groupId from name + Lima::Common::MediaticData::EntityGroupId groupId2 = Common::MediaticData::MediaticData::single().getEntityGroupId(groupName2); + std::cerr << "groupName2 = " << groupName1 << ", groupId2 = " << groupId2 << std::endl; + BOOST_REQUIRE( groupId2 == group2 ); + // test getter: get groupName from groupId + LimaString groupName22 = Common::MediaticData::MediaticData::single().getEntityGroupName(groupId2); + BOOST_REQUIRE( groupName2 == groupName22); + + // test getter: get groupId from name + Lima::Common::MediaticData::EntityGroupId groupId1 = Common::MediaticData::MediaticData::single().getEntityGroupId(groupName1); + std::cerr << "groupName1 = " << groupName1 << ", groupId1 = " << groupId1 << std::endl; + BOOST_REQUIRE( groupId1 == group1 ); + // test getter: get groupName from groupId + BOOST_REQUIRE( groupName1 == Common::MediaticData::MediaticData::single().getEntityGroupName(groupId1)); + + // test getter: get entity name from entity + LimaString name11 = Common::MediaticData::MediaticData::single().getEntityName(type11); + LimaString qualifiedEntityName11("LinguisticProcessing.IDIOM"); + std::cerr << "name11 = " << name11 << std::endl; + BOOST_REQUIRE(name11==qualifiedEntityName11); + + // test getter: get entity type from name + LimaString qualifiedEntityName21("Location.CITY"); + LimaString 
simpleEntityName21("CITY"); + Lima::Common::MediaticData::EntityType entityType211 = Common::MediaticData::MediaticData::single().getEntityType(qualifiedEntityName21); + BOOST_REQUIRE(entityType211 == type21); + Lima::Common::MediaticData::EntityType entityType212 = Common::MediaticData::MediaticData::single().getEntityType(groupId2, simpleEntityName21); + BOOST_REQUIRE(entityType211 == type21); + + } diff --git a/lima_common/src/common/ProcessUnitFramework/AnalysisContent.cpp b/lima_common/src/common/ProcessUnitFramework/AnalysisContent.cpp index fcfff1aca..aac674a08 100644 --- a/lima_common/src/common/ProcessUnitFramework/AnalysisContent.cpp +++ b/lima_common/src/common/ProcessUnitFramework/AnalysisContent.cpp @@ -24,9 +24,15 @@ using namespace std; namespace Lima { - -AnalysisContent::AnalysisContent() : +AnalysisContent::AnalysisContent( +#ifdef ANTINNO_SPECIFIC + Lima::StopAnalyze const& sa +#endif + ) : m_analysisData() +#ifdef ANTINNO_SPECIFIC + , _stopAnalyze(sa) +#endif {} AnalysisContent::~AnalysisContent() @@ -48,6 +54,12 @@ AnalysisContent::~AnalysisContent() LDEBUG << "AnalysisContent::~AnalysisContent all data deleted"; #endif } +#ifdef ANTINNO_SPECIFIC +StopAnalyze const& AnalysisContent::stopAnalyze() const +{ + return _stopAnalyze; +} +#endif AnalysisData* AnalysisContent::getData( const std::string& id) diff --git a/lima_common/src/common/ProcessUnitFramework/AnalysisContent.h b/lima_common/src/common/ProcessUnitFramework/AnalysisContent.h index b50a560ce..6db8efc1c 100644 --- a/lima_common/src/common/ProcessUnitFramework/AnalysisContent.h +++ b/lima_common/src/common/ProcessUnitFramework/AnalysisContent.h @@ -48,8 +48,11 @@ class LIMA_PROCESSUNITFRAMEWORK_EXPORT AnalysisData class LIMA_PROCESSUNITFRAMEWORK_EXPORT AnalysisContent { public: - - AnalysisContent(); + AnalysisContent( +#ifdef ANTINNO_SPECIFIC + Lima::StopAnalyze const& stopAnalyze = Lima::defaultStopAnalyze +#endif + ); /** * Destroy all AnalysisData in AnalysisContent @@ -102,9 
+105,15 @@ class LIMA_PROCESSUNITFRAMEWORK_EXPORT AnalysisContent */ void releaseData(const std::string& id); +#ifdef ANTINNO_SPECIFIC + StopAnalyze const& stopAnalyze() const; +#endif private: std::map m_analysisData; +#ifdef ANTINNO_SPECIFIC + Lima::StopAnalyze const& _stopAnalyze; +#endif }; diff --git a/lima_common/src/common/QsLog/QsLog.cpp b/lima_common/src/common/QsLog/QsLog.cpp index 83b2754b6..c75ad912a 100644 --- a/lima_common/src/common/QsLog/QsLog.cpp +++ b/lima_common/src/common/QsLog/QsLog.cpp @@ -34,6 +34,13 @@ #include #include + +#ifdef ANTINNO_SPECIFIC +#include +#include +#include +#endif + LIMA_COMMONQSLOG_EXPORT QDebug& operator<< (QDebug& qd, const std::string& str ) { qd << str.c_str(); @@ -42,6 +49,69 @@ LIMA_COMMONQSLOG_EXPORT QDebug& operator<< (QDebug& qd, const std::string& str namespace QsLogging { + +#ifdef ANTINNO_SPECIFIC + +namespace antinno { + +::boost::shared_ptr log; + + + + +Log4cpp::Log4cpp() +{ +} +void Log4cpp::configure(::std::string const& configFilePath) +{ + ::log4cpp::PropertyConfigurator::configure(configFilePath); + // todo : rcuprer le vrai msg de l'erreur + if (!::log4cpp::Appender::reopenAll()) + { + ::std::ostringstream oss; + oss << "log4cpp::Appender::reopenAll() return false. 
Maybe a problem with file " << configFilePath; + throw ::std::exception(oss.str().data()); + } +} +bool Log4cpp::canWrite(CategoryId const& id, Level level) const +{ + return ::log4cpp::Category::getInstance(id).isPriorityEnabled(level); +} +void Log4cpp::writeRecord(CategoryId const& id, Level level, char const* pNullTerminatedUtf8String) +{ + ::log4cpp::Category::getInstance(id) << level << pNullTerminatedUtf8String; +} + + + + + +LogHelper::LogHelper(QsLogging::Level l, const QString& zone) + :_zone(zone.toStdString()), _level(l) +{ +} +::std::ostream& LogHelper::stream() +{ + return _stream; +} +LogHelper::~LogHelper() +{ + auto l = info; + switch(_level) + { + case QsLogging::TraceLevel: l = debug; break; + case QsLogging::DebugLevel: l = debug; break; + case QsLogging::InfoLevel: l = info; break; + case QsLogging::WarnLevel: l = warn; break; + case QsLogging::ErrorLevel: l = error; break; + case QsLogging::FatalLevel: l = fatal; break; + } + log->writeRecord(CategoryId(_zone.c_str()), l, _stream.str().c_str()); +} + +} +#endif + typedef QList DestinationList; static const char TraceString[] = "TRACE"; @@ -168,8 +238,8 @@ void Logger::Helper::writeToLog() QTextStream ts(&s); ts << QThread::currentThread(); const QString completeMessage(QString("%1 %2 %3 %4") - .arg(levelName, 5) .arg(QDateTime::currentDateTime().toString(fmtDateTime)) + .arg(levelName, 5) .arg(s) .arg(buffer) ); diff --git a/lima_common/src/common/QsLog/QsLog.h b/lima_common/src/common/QsLog/QsLog.h index 692e1d55f..ff4967728 100644 --- a/lima_common/src/common/QsLog/QsLog.h +++ b/lima_common/src/common/QsLog/QsLog.h @@ -38,9 +38,16 @@ #include "QsLog_export.h" +#ifdef ANTINNO_SPECIFIC +#include +#include +#endif + namespace QsLogging { + + enum Level { TraceLevel = 0, @@ -128,6 +135,61 @@ class LIMA_COMMONQSLOG_EXPORT Logger LIMA_COMMONQSLOG_EXPORT QDebug& operator<< (QDebug& qd, const std::string& str ); + + + + + +#ifdef ANTINNO_SPECIFIC + +namespace antinno { + +typedef ::std::string 
CategoryId; + +enum LIMA_COMMONQSLOG_EXPORT Level // identiques ceux de log4cpp +{ + emerg = 0, fatal = 0, alert = 100, crit = 200, error = 300, warn = 400, notice = 500, info = 600, debug = 700 +}; + +class LIMA_COMMONQSLOG_EXPORT ILog +{ +public: + virtual void configure(::std::string const& configFilePath) = 0; + virtual bool canWrite(CategoryId const& id, Level level) const = 0; + virtual void writeRecord(CategoryId const& id, Level level, char const* pNullTerminatedUtf8String) = 0; +}; + + +class LIMA_COMMONQSLOG_EXPORT Log4cpp : public ILog +{ +public: + Log4cpp(); + void configure(::std::string const& configFilePath); + bool canWrite(CategoryId const& id, Level level) const; + void writeRecord(CategoryId const& id, Level level, char const* pNullTerminatedUtf8String); +}; + +extern LIMA_COMMONQSLOG_EXPORT ::boost::shared_ptr log; + +class LIMA_COMMONQSLOG_EXPORT LogHelper +{ +public: + explicit LogHelper(QsLogging::Level logLevel, const QString& zone); + ~LogHelper(); + ::std::ostream& stream(); +private: + QsLogging::Level const _level; + ::std::ostringstream _stream; + ::std::string const _zone; +}; + +} +#endif + + + + + } // end namespace //! Logging macros: define QS_LOG_LINE_NUMBERS to get the file and line number diff --git a/lima_common/src/common/QsLog/QsLogCategories.cpp b/lima_common/src/common/QsLog/QsLogCategories.cpp index 6ca3a46ff..755e0ddb3 100644 --- a/lima_common/src/common/QsLog/QsLogCategories.cpp +++ b/lima_common/src/common/QsLog/QsLogCategories.cpp @@ -1,184 +1,242 @@ -/* - Copyright 2002-2013 CEA LIST - - This file is part of LIMA. - - LIMA is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. 
- - LIMA is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with LIMA. If not, see -*/ -#include "QsLogCategories.h" -#include "common/tools/LimaFileSystemWatcher.h" - -#ifdef WIN32 -#pragma warning(disable: 4127) -#endif -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace Lima; - -namespace QsLogging -{ - -// static const int init = initQsLog(); - -class CategoriesImpl -{ -public: - CategoriesImpl() - { - } - QMap categories; - - LimaFileSystemWatcher m_configFileWatcher; -}; - -Categories::Categories(QObject* parent) : - QObject(parent), - d(new CategoriesImpl()) -{ - connect(&d->m_configFileWatcher,SIGNAL(fileChanged(QString)),this,SLOT(configureFileChanged(QString))); -} - -Categories::~Categories() -{ - delete d; -} - -void Categories::configureFileChanged ( const QString & path ) -{ - if (QFile(path).exists()) - { - configure(path); - } -} - -bool Categories::configure(const QString& fileName) -{ - QFile file(fileName); - QFileInfo fileInfo(fileName); - QDir configDir = fileInfo.dir(); - - if (configDir.exists("log4cpp")) - { - QString log4cppSubdirName = configDir.filePath("log4cpp"); - QFileInfo log4cppSubdirInfo(log4cppSubdirName); - if (log4cppSubdirInfo.isDir()) - { - QStringList nameFilters; - nameFilters << "log4cpp.*.properties"; - QDir log4cppSubdir(log4cppSubdirName); - QFileInfoList configFiles = log4cppSubdir.entryInfoList(nameFilters); - Q_FOREACH(QFileInfo configFile, configFiles) - { - configure(configFile.absoluteFilePath()); - } - } - } - - if (!file.open(QIODevice::ReadOnly)) - { - std::cerr << "Unable to open qslog configuration file: " << fileName.toUtf8().data() << std::endl; - return false; - } - 
d->m_configFileWatcher.addPath(fileName); - - bool res = true; - QTextStream in(&file); - QString line = in.readLine(); - while (!line.isNull()) - { - if (!line.startsWith("#")) - { - QStringList elts = line.split("="); - if (elts.size()==2 && elts.at(0).trimmed().startsWith("log4j.category.")) - { - QString category = elts.at(0).trimmed().remove(0,QString("log4j.category.").size()); - QString value = elts.at(1).trimmed(); - if (value == "TRACE") - d->categories.insert(category,QsLogging::TraceLevel); - else if (value == "DEBUG") - d->categories.insert(category,QsLogging::DebugLevel); - else if (value == "INFO") - d->categories.insert(category,QsLogging::InfoLevel); - else if (value == "WARN") - d->categories.insert(category,QsLogging::WarnLevel); - else if (value == "ERROR") - d->categories.insert(category,QsLogging::ErrorLevel); - else if (value == "FATAL") - d->categories.insert(category,QsLogging::FatalLevel); - else - { - std::cerr << "Error reading " << fileName.toUtf8().constData() << ": unknow level " << value.toUtf8().constData() << ". Using TRACE" << std::endl; - res = false; - d->categories.insert(category,QsLogging::TraceLevel); - } - } - else if (elts.size()==2 && elts.at(0).trimmed() == "include") - { - QString includedFileName = elts.at(1).trimmed(); - QString includedInitFileName = includedFileName; - if (!QFileInfo(includedInitFileName).isAbsolute()) - { - includedInitFileName = configDir.filePath(includedInitFileName); - } - configure(includedInitFileName); - } - } - line = in.readLine(); - } - return res; -} - -Level Categories::levelFor(const QString& category) const -{ -#ifdef DEBUG_CD - // Do not compile this costly check in release - if (!d->categories.contains(category)) - { - std::cerr << "Error: unknown category. 
Using TRACE for " << category.toUtf8().constData() << std::endl; - } -#endif - return d->categories.value(category, QsLogging::TraceLevel); -} - -LIMA_COMMONQSLOG_EXPORT int initQsLog(const QString& configDir) { - try { - QString initFileName = (configDir.isEmpty() ? - QString::fromUtf8(qgetenv("LIMA_CONF").isEmpty() ? - "/usr/share/config/lima" : - qgetenv("LIMA_CONF").constData()) : - configDir ) + "/log4cpp.properties"; - if (!QsLogging::Categories::instance().configure(initFileName)) - { - std::cerr << "Configure Problem " << initFileName.toUtf8().constData() << std::endl; - return -1; - } - // } -} catch(...) { - std::cerr << "Exception during logging system configuration" << std::endl; - return -1; -} -return 0; -} - -} // end namespace - +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. 
If not, see +*/ +#include "QsLogCategories.h" +#include "common/tools/LimaFileSystemWatcher.h" +#include "common/tools/FileUtils.h" + +#ifdef WIN32 +#pragma warning(disable: 4127) +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace Lima; +using namespace Lima::Common::Misc; + +namespace QsLogging +{ + +// static const int init = initQsLog(); + +class CategoriesImpl +{ +public: + CategoriesImpl() + { + } + QMap categories; + + LimaFileSystemWatcher m_configFileWatcher; +}; + +Categories::Categories(QObject* parent) : + QObject(parent), + d(new CategoriesImpl()) +{ + connect(&d->m_configFileWatcher,SIGNAL(fileChanged(QString)),this,SLOT(configureFileChanged(QString))); + QString category = "FilesReporting"; +#ifdef DEBUG_CD + d->categories.insert(category,QsLogging::InfoLevel); +#else + d->categories.insert(category,QsLogging::ErrorLevel); +#endif +} + +Categories::~Categories() +{ + delete d; +} + +void Categories::configureFileChanged ( const QString & path ) +{ + if (QFile(path).exists()) + { + configure(path); + } +} + +bool Categories::configure(const QString& fileName) +{ + QFile file(fileName); + QFileInfo fileInfo(fileName); + QDir configDir = fileInfo.dir(); + +// if (configDir.exists("log4cpp")) +// { +// QString log4cppSubdirName = configDir.filePath("log4cpp"); +// QFileInfo log4cppSubdirInfo(log4cppSubdirName); +// if (log4cppSubdirInfo.isDir()) +// { +// QStringList nameFilters; +// nameFilters << "log4cpp.*.properties"; +// QDir log4cppSubdir(log4cppSubdirName); +// QFileInfoList configFiles = log4cppSubdir.entryInfoList(nameFilters); +// Q_FOREACH(QFileInfo configFile, configFiles) +// { +// configure(configFile.absoluteFilePath()); +// } +// } +// } + + if (!file.open(QIODevice::ReadOnly)) + { + std::cerr << "Unable to open qslog configuration file: " << fileName.toUtf8().data() << std::endl; + return false; + } + d->m_configFileWatcher.addPath(fileName); + + bool res = true; 
+ QTextStream in(&file); + QString line = in.readLine(); + while (!line.isNull()) + { + if (!line.startsWith("#")) + { + QStringList elts = line.split("="); + if (elts.size()==2 && elts.at(0).trimmed().startsWith("log4j.category.")) + { + QString category = elts.at(0).trimmed().remove(0,QString("log4j.category.").size()); + QString value = elts.at(1).trimmed(); + if (value == "TRACE") + d->categories.insert(category,QsLogging::TraceLevel); + else if (value == "DEBUG") + d->categories.insert(category,QsLogging::DebugLevel); + else if (value == "INFO") + d->categories.insert(category,QsLogging::InfoLevel); + else if (value == "WARN") + d->categories.insert(category,QsLogging::WarnLevel); + else if (value == "ERROR") + d->categories.insert(category,QsLogging::ErrorLevel); + else if (value == "FATAL") + d->categories.insert(category,QsLogging::FatalLevel); + else + { + std::cerr << "Error reading " << fileName.toUtf8().constData() << ": unknow level " << value.toUtf8().constData() << ". Using TRACE" << std::endl; + res = false; + d->categories.insert(category,QsLogging::TraceLevel); + } + } + else if (elts.size()==2 && elts.at(0).trimmed() == "include") + { + QString includedFileName = elts.at(1).trimmed(); + QString includedInitFileName = includedFileName; + if (!QFileInfo(includedInitFileName).isAbsolute()) + { + includedInitFileName = configDir.filePath(includedInitFileName); + } + configure(includedInitFileName); + } + } + line = in.readLine(); + } + LOGINIT("FilesReporting"); + LINFO << "QsLog conf file loaded:" << fileName; + return res; +} + +Level Categories::levelFor(const QString& category) const +{ +#ifdef DEBUG_CD + // Do not compile this costly check in release + if (!d->categories.contains(category)) + { + std::cerr << "Error: unknown category. 
Using TRACE for " << category.toUtf8().constData() << std::endl; + } +#endif + return d->categories.value(category, QsLogging::TraceLevel); +} + +LIMA_COMMONQSLOG_EXPORT int initQsLog(const QString& configString) +{ + bool atLeastOneSuccessfulLoad = false; + QStringList configDirsList; + if (configString.isEmpty()) + { + configDirsList = buildConfigurationDirectoriesList(QStringList()<<"lima",QStringList()); + } + else + { + configDirsList = configString.split(LIMA_PATH_SEPARATOR); + } + try + { + while (! configDirsList.isEmpty() ) + { + QString configDir = configDirsList.last(); + configDirsList.pop_back(); + QDir initDir( configDir + "/log4cpp"); + if (initDir.exists()) + { + QStringList entryList = initDir.entryList(QDir::Files); + Q_FOREACH(QString entry, entryList) + { + if (QsLogging::Categories::instance().configure(configDir + "/log4cpp/" + entry)) + { + atLeastOneSuccessfulLoad = true; + } + else + { + std::cerr << "Configure Problem " << entry.toUtf8().constData() << std::endl; + return -1; + } + } + } + QString initFileName = configDir + "/log4cpp.properties"; +#ifdef ANTINNO_BUGFIX + // QFileInfo::exists(...) ne fonctionne pas avec qt 4.8 + if (QFileInfo(initFileName).exists()) +#else + if (QFileInfo::exists(initFileName)) +#endif + { + if (QsLogging::Categories::instance().configure(initFileName)) + { + atLeastOneSuccessfulLoad = true; + } + else + { + std::cerr << "Configure Problem " << initFileName.toUtf8().constData() << std::endl; + return -1; + } + } + } + } + catch(...) 
+ { + std::cerr << "Exception during logging system configuration" << std::endl; + return -1; +} + if (!atLeastOneSuccessfulLoad) + { + std::cerr << "Configure Problem no configure file has been found in" << configString.toStdString() << std::endl; + return -1; + } +return 0; +} + +} // end namespace + diff --git a/lima_common/src/common/XMLConfigurationFiles/xmlConfigurationFileParser.cpp b/lima_common/src/common/XMLConfigurationFiles/xmlConfigurationFileParser.cpp index 164201647..fb634df8f 100644 --- a/lima_common/src/common/XMLConfigurationFiles/xmlConfigurationFileParser.cpp +++ b/lima_common/src/common/XMLConfigurationFiles/xmlConfigurationFileParser.cpp @@ -113,6 +113,11 @@ XMLConfigurationFileParserPrivate::XMLConfigurationFileParserPrivate(const strin LERROR << "Error parsing " << m_configurationFileName.c_str(); throw XMLException(std::string("XMLConfigurationFileParser Unable to parse ") + m_configurationFileName + " : " + m_parser->errorHandler()->errorString().toUtf8().constData()); } + { + LOGINIT("FilesReporting"); + LINFO << "File parsed:" << m_configurationFileName; + } + } XMLConfigurationFileParserPrivate::~XMLConfigurationFileParserPrivate() diff --git a/lima_common/src/common/misc/AbstractAccessIterators.cpp b/lima_common/src/common/misc/AbstractAccessIterators.cpp index d4fab5249..cd173e76e 100644 --- a/lima_common/src/common/misc/AbstractAccessIterators.cpp +++ b/lima_common/src/common/misc/AbstractAccessIterators.cpp @@ -64,11 +64,6 @@ AccessSubWordIterator& AccessSubWordIterator::operator=(const AccessSubWordItera AccessSubWordIterator::~AccessSubWordIterator() { -#ifdef DEBUG_CD - STRINGMAPLOGINIT; - LDEBUG << this << ": AccessSubWordIterator::~AccessSubWordIterator() " - ; -#endif delete m_delegate; } @@ -106,11 +101,6 @@ AccessSuperWordIterator& AccessSuperWordIterator::operator=(const AccessSuperWor } AccessSuperWordIterator::~AccessSuperWordIterator() { -#ifdef DEBUG_CD - STRINGMAPLOGINIT; - LDEBUG << this << ": 
AccessSuperWordIterator::~AccessSuperWordIterator() " - ; -#endif delete m_delegate; } diff --git a/lima_common/src/common/misc/stringspool.cpp b/lima_common/src/common/misc/stringspool.cpp index 88e7ddebc..ddf5dbe16 100644 --- a/lima_common/src/common/misc/stringspool.cpp +++ b/lima_common/src/common/misc/stringspool.cpp @@ -271,6 +271,11 @@ void StringsPoolPrivate::clear() // reinit from pos to the end void StringsPoolPrivate::clear(const uint64_t pos) { + // reinitialize hashPool + // WARNING: The m_hashPool hash table contains the same pointer as the m_vecPool + // vector. So, override its content BEFORE free memory to avoid crash (on Windows) + m_hashPool=m_resourcesHashPool; + // STRPOOLLOGINIT; // LDEBUG << "clearing StringsPool"; uint64_t i(pos),size(m_vecPool.size()); @@ -280,8 +285,6 @@ void StringsPoolPrivate::clear(const uint64_t pos) m_vecPool[i] = 0; } m_vecPool.resize(pos); - // reinitialize hashPool - m_hashPool=m_resourcesHashPool; } #ifndef WIN32 diff --git a/lima_common/src/common/misc/stringspool.h b/lima_common/src/common/misc/stringspool.h index c16826adc..935236d01 100644 --- a/lima_common/src/common/misc/stringspool.h +++ b/lima_common/src/common/misc/stringspool.h @@ -42,7 +42,7 @@ #include #endif #endif -#include +#include namespace Lima { diff --git a/lima_common/src/common/time/timeUtilsController.cpp b/lima_common/src/common/time/timeUtilsController.cpp index 784bde39f..efe95e1ea 100644 --- a/lima_common/src/common/time/timeUtilsController.cpp +++ b/lima_common/src/common/time/timeUtilsController.cpp @@ -41,7 +41,12 @@ TimeUtilsController::~TimeUtilsController() { uint64_t delta = TimeUtils::elapsedTime(m_topic); if (m_logElapsedTime) { TIMELOGINIT; +#ifdef ANTINNO_SPECIFIC + // FWI 09/11/2015 gestion temps en microsecondes sous windows + LINFO << m_topic << " ( ): " << delta << " us"; +#else LINFO << m_topic << " ( ): " << delta << " ms"; +#endif } } diff --git a/lima_common/src/common/time/traceUtils.cpp 
b/lima_common/src/common/time/traceUtils.cpp index 4249a03cf..306ecef14 100644 --- a/lima_common/src/common/time/traceUtils.cpp +++ b/lima_common/src/common/time/traceUtils.cpp @@ -29,30 +29,105 @@ #include "traceUtils.h" #include +#ifdef ANTINNO_SPECIFIC +// FWI 28/10/2015 modifs pour utiliser une horloge plus prcise (en us au lieu de ms) sous windows +// + ajout d'un compteur + +#ifdef WIN32 +#include "Windows.h" +#include + + +LARGE_INTEGER m_f; +static bool m_freqInit = false; + +namespace +{ + uint64_t _winTime() + { + LARGE_INTEGER i; + if (m_freqInit == false) + { + QueryPerformanceFrequency(&m_f); + m_freqInit = true; + } + QueryPerformanceCounter(&i); + + return (i.QuadPart * 1000000) / m_f.QuadPart; // microseconds + } +} +#else +#error no implementation for non-win32 systems +#endif +#endif + + namespace Lima { //********************************************************************** //initialization of static members //********************************************************************** // uint64_t TimeUtils::currentTime={0,0}; +#ifdef ANTINNO_SPECIFIC +std::map TimeUtils::m_cumulatedTime = std::map(); +#else std::map > TimeUtils::m_cumulatedTime = std::map >(); +#endif QMutex TimeUtils::m_mutex; + +#ifdef ANTINNO_SPECIFIC +TimeUtils::TimeUtils() +{ +} +#endif //********************************************************************** // member functions //********************************************************************** -uint64_t TimeUtils::getCurrentTime() { +uint64_t TimeUtils::getCurrentTime() { +#ifdef ANTINNO_SPECIFIC +#ifdef WIN32 + return _winTime(); +#else +#error no implementation for non-win32 systems +#endif +#else return QDateTime::currentMSecsSinceEpoch(); +#endif } +#ifdef ANTINNO_SPECIFIC +// FWI 03/11/24 nouvelle mthode pour remettre zro le cumul +void TimeUtils::restart( const std::string& taskCategory) +{ + QMutexLocker locker(&m_mutex); +#ifdef WIN32 + m_cumulatedTime[taskCategory].first = _winTime(); + //cout << 
"updateCurrentTime=" << m_cumulatedTime[taskCategory].first << ::std::endl; +#else + m_cumulatedTime[taskCategory].first = QDateTime::currentMSecsSinceEpoch(); +#endif + m_cumulatedTime[taskCategory].second = 0; + m_cumulatedTime[taskCategory].count = 0; +} +#endif void TimeUtils::updateCurrentTime( const std::string& taskCategory ) { QMutexLocker locker(&m_mutex); +#ifdef ANTINNO_SPECIFIC +#ifdef WIN32 + m_cumulatedTime[taskCategory].first = _winTime(); + //cout << "updateCurrentTime=" << m_cumulatedTime[taskCategory].first << ::std::endl; +#else +#error no implementation for non-win32 systems +#endif +#else m_cumulatedTime[taskCategory].first = QDateTime::currentMSecsSinceEpoch(); +#endif } // void TimeUtils::updateCurrentTime() { -// boost::mutex::scoped_lock(m_mutex); +// ::boost::mutex::scoped_lock(m_mutex); // gettimeofday(¤tTime,0); // } @@ -67,10 +142,32 @@ uint64_t TimeUtils::diffTime(const uint64_t& begin, } uint64_t TimeUtils::elapsedTime(const std::string& taskCategory) { +#ifdef ANTINNO_SPECIFIC +#ifdef WIN32 + uint64_t newTime = _winTime(); +#else +#error no implementation for non-win32 systems +#endif +#else uint64_t newTime = QDateTime::currentMSecsSinceEpoch(); +#endif + //cout << "newTime=" << newTime << ::std::endl; + //cout << "oldTime=" << m_cumulatedTime[taskCategory].first << ::std::endl; +#ifdef ANTINNO_SPECIFIC +#ifdef WIN32 + uint64_t delta = newTime - m_cumulatedTime[taskCategory].first; +#else +#error no implementation for non-win32 systems +#endif +#else uint64_t delta = diffTime(m_cumulatedTime[taskCategory].first,newTime); +#endif + //cout << "delta=" << delta << ::std::endl; m_cumulatedTime[taskCategory].second += delta; m_cumulatedTime[taskCategory].first = newTime; +#ifdef ANTINNO_SPECIFIC + ++m_cumulatedTime[taskCategory].count; + #endif return delta; } @@ -80,7 +177,11 @@ uint64_t TimeUtils::elapsedTime(const std::string& taskCategory) { void TimeUtils::logElapsedTime(const std::string& mess, const std::string& taskCategory) { 
TIMELOGINIT; +#ifdef ANTINNO_SPECIFIC + LINFO << mess << "(" << taskCategory << "): " << TimeUtils::elapsedTime(taskCategory) << " us"; +#else LINFO << mess << "(" << taskCategory << "): " << TimeUtils::elapsedTime(taskCategory) << " ms"; +#endif } /** @@ -89,15 +190,27 @@ void TimeUtils::logElapsedTime(const std::string& mess, void TimeUtils::logCumulatedTime(const std::string& mess, const std::string& taskCategory) { TIMELOGINIT; +#ifdef ANTINNO_SPECIFIC + LINFO << std::setfill('0') << std::setw(9) << m_cumulatedTime[taskCategory].second << " us" + << " count : " << std::setfill('0') << std::setw(6) << m_cumulatedTime[taskCategory].count << ": " << mess; +#else LINFO << mess << ": " << m_cumulatedTime[taskCategory].second << " ms"; +#endif } void TimeUtils::logAllCumulatedTime(const std::string& mess) { TIMELOGINIT; LINFO << mess << ": "; + +#ifdef ANTINNO_SPECIFIC + for( std::map::const_iterator it = m_cumulatedTime.begin() ; + it != m_cumulatedTime.end() ; it++ ) { + LINFO << it->first << ":" << it->second.second << " us" << " count: " << it->second.count; +#else for( std::map >::const_iterator it = m_cumulatedTime.begin() ; it != m_cumulatedTime.end() ; it++ ) { LINFO << it->first << ":" << it->second.second << " ms" ; +#endif } } diff --git a/lima_common/src/common/time/traceUtils.h b/lima_common/src/common/time/traceUtils.h index 641a87afb..137ae10ef 100644 --- a/lima_common/src/common/time/traceUtils.h +++ b/lima_common/src/common/time/traceUtils.h @@ -54,7 +54,16 @@ namespace Lima { class LIMA_TIME_EXPORT TimeUtils { public: - TimeUtils() {} +#ifdef ANTINNO_SPECIFIC + // FWI 04/11/2015 ajout classe + struct Data + { + uint64_t first; + uint64_t second; + uint64_t count; + }; +#endif + TimeUtils(); ~TimeUtils() {} /** @@ -64,6 +73,10 @@ namespace Lima { */ // static void updateCurrentTime( const std::string& taskCategory = std::string("") ); static void updateCurrentTime( const std::string& taskCategory = std::string("") ); +#ifdef ANTINNO_SPECIFIC + // FWI 
03/11/24 nouvelle mthode + static void restart( const std::string& taskCategory = std::string("") ); +#endif // static void setCurrentTime(uint64_t time); static void setCurrentTime(uint64_t time, const std::string& taskCategory = std::string("")); @@ -111,8 +124,14 @@ namespace Lima { private: /** last current time stored */ // static uint64_t currentTime; +#ifdef ANTINNO_SPECIFIC + // FWI 04/11/2015 remplacement de pair par Data + static std::map m_cumulatedTime; +#else static std::map > m_cumulatedTime; +#endif static QMutex m_mutex; + }; } // end namespace diff --git a/lima_common/src/common/tools/FileUtils.cpp b/lima_common/src/common/tools/FileUtils.cpp new file mode 100644 index 000000000..155d23de6 --- /dev/null +++ b/lima_common/src/common/tools/FileUtils.cpp @@ -0,0 +1,201 @@ +/* + Copyright 2015 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. 
If not, see +*/ +/************************************************************************ + * @file FileUtils.h + * @author Gael de Chalendar + * @date Tue Jul 7 2015 + * copyright Copyright (C) 2015 by CEA LIST + ***********************************************************************/ + +#include "FileUtils.h" +#ifdef ANTINNO_SPECIFIC +#include +#endif +#include +#include + +namespace Lima { +namespace Common { +namespace Misc { + +uint64_t countLines(std::istream& file) +{ + uint64_t result = 0; + std::streampos initialPosition = file.tellg(); + int c = file.get(); + while (c != -1) + { + while (c != -1 && c != '\n') + { + c = file.get(); + } + result = result + 1; + c = file.get(); + } + file.clear(); + file.seekg(initialPosition, std::ios_base::beg); + return result; +} + +uint64_t countLines(QFile& file) +{ + uint64_t result = 0; + qint64 initialPosition = file.pos(); + char c = '\0'; + while (!file.atEnd()) + { + while (!file.atEnd() && c != '\n') + { + file.getChar(&c); + } + result = result + 1; + file.getChar(&c); + } + file.seek(initialPosition); + return result; +} + +QStringList buildConfigurationDirectoriesList(const QStringList& projects, const QStringList& paths) +{ + QStringList configDirs; +#ifdef ANTINNO_SPECIFIC + BOOST_FOREACH(const QString& project, projects) +#else + for (const QString& project: projects) +#endif + { + QStringList confDirs; + QString projectConf = QString::fromUtf8(qgetenv((project.toUpper()+"_CONF").toStdString().c_str()).constData()); + if (!projectConf.isEmpty()) + confDirs << projectConf.split(LIMA_PATH_SEPARATOR); +#ifdef ANTINNO_SPECIFIC + BOOST_FOREACH(const QString &configDir, confDirs) +#else + for (const QString &configDir: confDirs ) +#endif + { + if (!configDir.isEmpty() && QDir(configDir).exists()) + { + configDirs << configDir; + } + } + if (confDirs.isEmpty()) + { + QString configDir = QString::fromUtf8(qgetenv((project.toUpper()+"_DIST").toStdString().c_str()).constData()) + "/share/config/" + project; + if 
(!configDir.isEmpty() && QDir( configDir ).exists() ) + { + configDirs << configDir; + } + else + { + configDir = QString::fromUtf8("/usr/share/config/") + project; + if (!configDir.isEmpty() && QDir( configDir ).exists() ) + { + configDirs << configDir; + } + } + } + } +#ifdef ANTINNO_SPECIFIC + BOOST_FOREACH(const QString& path, paths) +#else + for (const QString& path: paths) +#endif + { + if (!path.isEmpty() && QDir(path).exists()) + configDirs << path; + } + + return configDirs; +} + +QStringList buildResourcesDirectoriesList(const QStringList& projects, const QStringList& paths) +{ + QStringList resourcesDirs; +#ifdef ANTINNO_SPECIFIC + BOOST_FOREACH(const QString& project, projects) +#else + for (const QString& project: projects) +#endif + { + QStringList resDirs; + QString projectRes = QString::fromUtf8(qgetenv((project.toUpper()+"_RESOURCES").toStdString().c_str()).constData()); + if (!projectRes.isEmpty()) + resDirs << projectRes.split(LIMA_PATH_SEPARATOR); +#ifdef ANTINNO_SPECIFIC + BOOST_FOREACH(const QString &resourcesDir, resDirs) +#else + for (const QString &resourcesDir: resDirs ) +#endif + { + if (QDir(resourcesDir).exists()) + { + resourcesDirs << resourcesDir; + } + } + if (resDirs.isEmpty()) + { + QString resourcesDir = QString::fromUtf8(qgetenv((project.toUpper()+"_DIST").toStdString().c_str()).constData()) + "/share/apps/" + project + "/resources"; + if ( QDir( resourcesDir ).exists() ) + { + resourcesDirs << resourcesDir; + } + else + { + resourcesDir = QString::fromUtf8("/usr/share/apps/") + project + "/resources"; + if ( QDir( resourcesDir ).exists() ) + { + resourcesDirs << resourcesDir; + } + } + } + } +#ifdef ANTINNO_SPECIFIC + BOOST_FOREACH(const QString& path, paths) +#else + for (const QString& path: paths) +#endif + { + if (QDir(path).exists()) + resourcesDirs << path; + } + + return resourcesDirs; +} + +QString findFileInPaths(const QString& paths, const QString& fileName, const QChar& separator) +{ + QStringList pathsList = 
paths.split(separator); + Q_FOREACH(QString path, pathsList) + { + if (QFileInfo(path+ "/" + fileName).exists()) + { + return path+ "/" + fileName; + } + } + std::cerr << "WARNING: findFileInPaths no '" << fileName.toUtf8().constData() + << "' found in '" << paths.toUtf8().constData() + << "' separated by '" << separator.toLatin1() << "'" << std::endl; + return QString(); +} + + +} // end namespace +} // end namespace +} // end namespace diff --git a/lima_common/src/common/tools/FileUtils.h b/lima_common/src/common/tools/FileUtils.h new file mode 100644 index 000000000..a38507970 --- /dev/null +++ b/lima_common/src/common/tools/FileUtils.h @@ -0,0 +1,130 @@ +/* + Copyright 2015 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. If not, see +*/ +/************************************************************************ + * @file FileUtils.h + * @author Gael de Chalendar + * @date Tue Jul 7 2015 + * copyright Copyright (C) 2015 by CEA LIST + ***********************************************************************/ + +#ifndef LIMA_COMMON_MISC_FILEUTILS_H +#define LIMA_COMMON_MISC_FILEUTILS_H + +#include "common/LimaCommon.h" + +#include +#ifdef ANTINNO_BUGFIX +// ncessaire sinon le compilateur dit que QStringList n'a pas de constructeur... 
+#include +#endif + +#include + +#ifdef WIN32 +#ifdef ANTINNO_SPECIFIC +// ncessaire sinon on a une erreur c2664 : impossible de convertir de 'char' 'Qstring' dans 'QString::join' +static QChar const LIMA_PATH_SEPARATOR(';'); +#else +#define LIMA_PATH_SEPARATOR ';' +#endif +#else +#define LIMA_PATH_SEPARATOR ':' +#endif + +namespace Lima { +namespace Common { +namespace Misc { + +/** + * Count the number of lines in the given file from the current position + * + * If the last line has no character (no character after the last line break)' it is not counted. + * After this function, the file is in the same good state and at the same position. + * + * @param file the file to count the lines of + * + * @return the number of lines of the file + */ +LIMA_COMMONTOOLS_EXPORT uint64_t countLines(std::istream& file); + +/** + * Count the number of lines in the given file from the current position + * + * If the last line has no character (no character after the last line break)' it is not counted. + * After this function, the file is at the same position. + * + * @param file the file to count the lines of + * + * @return the number of lines of the file + */ +LIMA_COMMONTOOLS_EXPORT uint64_t countLines(QFile& file); + + +/** + * @brief Build a list of configuration directories from a list of project + * names and a list of paths. + * + * For each project name "project", try to add the dir from the environment + * variable $PROJECT_CONF. If it does not exist, try + * $PROJECT_DIST/share/config/project. If it does not exist either, try + * /usr/share/config/project. + * Then add existing paths from the given list. + * In LIMA the projects list will be limited to the single element "lima" but + * projects depending on LIMA will be able to add their own separate + * configurations. + * + * @param projects The list of project names to explore + * @param paths The list of paths to look into. 
+ */ +LIMA_COMMONTOOLS_EXPORT QStringList buildConfigurationDirectoriesList(const QStringList& projects, + const QStringList& paths = QStringList() ); + +/** + * @brief Build a list of resources directories from a list of project names + * and a list of paths. + * + * For each project name "project", try to add the dir from the environment + * variable $PROJECT_RESOURCES. If it does not exist, try + * $PROJECT_DIST/share/apps/project/resources. If it does not exist either, try + * /usr/share/apps/project/resources. + * Then add existing paths from the given list. + * In LIMA the projects list will be limited to the single element "lima" but + * projects depending on LIMA will be able to add their own separate + * resources. + * + * @param projects The list of project names to explore + * @param paths The list of paths to look into. + */ +LIMA_COMMONTOOLS_EXPORT QStringList buildResourcesDirectoriesList(const QStringList& projects, + const QStringList& paths = QStringList()); + +/** + * Find the given file in the given paths. + * @param paths the list of concatenated paths to search th file in + * @param fileName the name of the file to search into the paths. Can include a relative path + * @param separator the character used to split the list of paths. Defaults to semicolon + * @return the full path of the found file if found. Empty string otherwise. + */ +LIMA_COMMONTOOLS_EXPORT QString findFileInPaths(const QString& paths, const QString& fileName, const QChar& separator = LIMA_PATH_SEPARATOR); + +} // end namespace +} // end namespace +} // end namespace + +#endif diff --git a/lima_common/test/testFsaDict16.cpp b/lima_common/test/testFsaDict16.cpp index 79697b443..d58fd24d5 100644 --- a/lima_common/test/testFsaDict16.cpp +++ b/lima_common/test/testFsaDict16.cpp @@ -1,924 +1,949 @@ -/* - Copyright 2002-2013 CEA LIST - - This file is part of LIMA. 
- - LIMA is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - LIMA is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with LIMA. If not, see -*/ -/*************************************************************************** - testFsaDict16.cpp - description - ------------------- - begin : lun jun 2 2003 - copyright : (C) 2003 by Olivier Mesnard - email : olivier.mesnard@cea.fr -// ***************************************************************************/ - -/*************************************************************************** - * * - * compact dictionnary based on finite state automata * - * implemented with Boost Graph library * - * * - ***************************************************************************/ -#include "common/LimaCommon.h" - -#include "common/time/traceUtils.h" - -// string and file handling Utilities -#include "common/Data/strwstrtools.h" - -// dictionaries -#include "common/FsaAccess/FsaAccessBuilder16.h" -#include "common/FsaAccess/FsaAccessBuilderRandom16.h" -#include "common/FsaAccess/FsaAccessSpare16.h" -#include "common/misc/AbstractAccessByString.h" - -#include - -// for set locale -#include -// for system() -#include - -#include -#include -#include -#include -#include -#include -#include - -// For ::stat() function -#include -#include -#ifndef WIN32 -#include -#endif -using namespace std; -using namespace Lima; -using namespace Lima::Common; - -int logFileSize( const std::string& filename ) { - struct stat sts; - if( stat( filename.c_str(), &sts) != 0) - std::cerr << 
"logFileSize: error getting info for file " << filename << std::endl; - std::cout << "taille fichier: " << filename << "= " << sts.st_size << std::endl; - return sts.st_size; -} - -void logMemsize( const string& legend ) { -#ifdef WIN32 - LIMA_UNUSED(legend); -#else - pid_t pid = getpid(); - ostringstream ostr; - ostr << "/proc/" << pid << "/status"; - ifstream statusFile(ostr.str().c_str(), std::ifstream::binary); - char strbuff[200]; - for( ; ; ) { - string status; - statusFile.getline(strbuff, 200, '\n' ); - string line(strbuff); - if(line.empty() ) - break; - string::size_type composed1_pos = line.find("VmSize:"); - if( composed1_pos != string::npos ) { - string vmSizeStr(line, composed1_pos+7); - int vmSize = atoi(vmSizeStr.c_str()); - std::cerr << legend << " VmSize:" << vmSize; - } - } -#endif -} - -int getProcStat( const std::string& toLog ) { -#ifdef WIN32 - LIMA_UNUSED(toLog); - return 0; -#else - std::string statusFile; - - ostringstream os; - os << "/proc/" << getpid() << "/status"; - statusFile=os.str(); - - ifstream statusIn(statusFile.c_str(),ios::in | std::ifstream::binary); - string line; - int val; - while (!statusIn.eof()) - { - getline(statusIn,line); -// std::cout << "line = " << line << std::endl; - size_t index=line.find(toLog); - if( index != std::string::npos ) { -// std::cout << "index = " << index << std::endl; - string valstr=line.substr(index+toLog.size()+1); -// std::cout << "valstr = " << valstr << std::endl; - val = atoi(valstr.c_str()); - std::cout << toLog << "=" << val < & listOfWords ) - { - std::ifstream wList(listOfWordsFilename.c_str(), std::ios::in | std::ios::binary ); - if ( !wList.is_open() ) { - std::cerr << "Cannot open list of words " << listOfWordsFilename << std::endl; - return EXIT_FAILURE; - } - std::cerr << "Read list of words" << std::endl; - char strbuff[200]; - - for( int counter = 0 ; ; counter++ ) { - // lecture d'une ligne du fichier - wList.getline(strbuff, 200, '\n' ); - string line(strbuff); - if( 
line.size() == 0 ) { - std::cerr << "end of list of words. counter=" << counter << std::endl; - break; - } - else { - // extraction cha�e - Lima::LimaString word = Lima::Common::Misc::utf8stdstring2limastring(line); - listOfWords.push_back(word); - } - } - return EXIT_SUCCESS; -} - - -template -class DictTester { - public: - DictTester(Param param, dictType &dico) : m_param(param), m_dico(dico) { - } - void exec( void ); - void testSub(std::vector& hyperwords, - std::vector& offsets, - std::vector > &subwords, bool withAssert ); - void testSuper(typename std::vector::const_iterator begin, - typename std::vector::const_iterator end ); - void testIndex( typename std::vector::const_iterator begin, - typename std::vector::const_iterator end, - const std::vector& indexes ); - void testSpelling( typename std::vector::const_iterator begin, - typename std::vector::const_iterator end, - const std::vector& indexes ); - void addListOfWords(); - void addListOfUnorderedWords(); - void write( void ); - private: - Param m_param; - dictType &m_dico; -}; - - -template -void DictTester::addListOfWords() { - - if( !m_param.listOfWords.size() ) - return; - - std::ifstream wList(m_param.listOfWords.c_str(), std::ios::in | std::ios::binary ); - if ( !wList.is_open() ) { - std::cerr << "Cannot open list of words " << m_param.listOfWords << std::endl; - return; - } - std::cerr << "Read list of words" << std::endl; - char strbuff[200]; - - for( int counter = 0 ; ; counter++ ) { - if( (counter%10000) == 0 ) { - ostringstream ostr; - ostr << "\naddListOfWords counter = " << counter; -// std::cerr << "addListOfWords counter = " << counter << std::endl; - logMemsize( ostr.str() ); - } - // lecture d'une ligne du fichier - wList.getline(strbuff, 200, '\n' ); - string line(strbuff); - if( wList.eof() ) - { - std::cerr << "end of list of words. 
counter=" << counter << std::endl; - break; - } - else if (!line.empty()) - { -// std::cerr << "addListOfWords: (" << line << ")" << std::endl; - Lima::LimaString word = Lima::Common::Misc::utf8stdstring2limastring(line); -// std::cerr << "addListOfWords: addWord(" << word << ")" << std::endl; - m_dico.addWord( word ); - } - } - std::cerr << std::endl; - m_dico.pack(); -} - -template -void DictTester::addListOfUnorderedWords() { - - if( m_param.printGraph ) { - std::cerr << "Print graph...." << std::endl; - m_dico.printGraph(std::cerr); - } - - if( !m_param.listOfWords.compare(std::string("")) ) - return; - - std::vector listOfWords; - readListOfWords(m_param.listOfWords, listOfWords); - - int counter(0); - for( std::vector::iterator itWord = listOfWords.begin() ; - itWord != listOfWords.end() ; itWord++, counter++ ) { -// if( (counter%10000) == 0 ) { - std::cerr << "addListOfWords(" << *itWord << "), counter = " << counter << std::endl; -// } - m_dico.addRandomWord( *itWord ); - } - - if( m_param.printGraph ) { - std::cerr << "Print graph...." 
<< std::endl; - m_dico.printGraph(std::cerr); - } -// m_dico.pack(); -} - -template -void DictTester::testIndex( - typename std::vector::const_iterator begin, - typename std::vector::const_iterator end, - const std::vector& indexes ) { - std::cout << "testIndex: getSize() = " << m_dico.getSize() << std::endl; - - std::vector::const_iterator indexItr = indexes.begin(); - int index0 = 0; - - for( typename std::vector::const_iterator lemma = begin ; - lemma != end ; lemma++ ) { - // recup�ation de l'index �partir de la cha�e de caract�es - int index = m_dico.getIndex(*lemma); - // traces - if( index%10000 == 0 ) { - ostringstream ostr; - ostr << "testIndex index = " << index; -// std::cerr << "addListOfWords counter = " << counter << std::endl; - logMemsize( ostr.str() ); - } - if( m_param.withDebug ) { - Lima::LimaString newWord = *lemma; - std::cout << "testIndex: getIndex(" - << Lima::Common::Misc::limastring2utf8stdstring(newWord) - << ")=" << index << std::endl; - } - else { - if( index%10000 == 1 ) { - Lima::LimaString newWord = *lemma; - std::cout << "testIndex: getIndex(" << Lima::Common::Misc::limastring2utf8stdstring(newWord) - << ")=" << index << std::endl; - } - } - // result verification - if( m_param.withAssert ) { - if( indexItr != indexes.end() ) { -// std::cerr << "check " << index << "!=" << *indexItr << std::endl; - assert( index == *indexItr); - indexItr++; - } - else { -// std::cerr << "check " << index << "!=" << index0+1 << std::endl; - assert( index == index0+1 ); - index0 = index; - } - } - } - - // test sur chaine n'existant pas - for( typename std::vector::const_iterator lemma = begin ; - lemma != end ; lemma++ ) { - int index = m_dico.getIndex(*lemma); - Lima::LimaString invertedLemma; - for( int i = (*lemma).size()-1; i >= 0 ; i-- ) { - invertedLemma.push_back((*lemma)[i]); - } - int invertedIndex = m_dico.getIndex(invertedLemma); - // traces - if( index%10000 == 0 ) { - ostringstream ostr; - ostr << "testIndex inverted (" - << 
Lima::Common::Misc::limastring2utf8stdstring(invertedLemma) - << ") index = " << invertedIndex; - logMemsize( ostr.str() ); - } - } -} - -template - void DictTester::testSpelling( typename std::vector::const_iterator begin, - typename std::vector::const_iterator end, - const std::vector& indexes ) -//void DictTester::testSpelling( int *indexVal, int nbIndex ) -{ - LIMA_UNUSED(end); - typename std::vector::const_iterator lemma = begin; - - // if size of indexes = 1, we just display the string return by getSpelling() - std::cout << "testSpelling: getSpelling: indexes.size()=" << indexes.size() << std::endl; - if( indexes.size() == 1 ) { - Lima::LimaString spelling; - spelling = m_dico.getSpelling(indexes[0]); - std::cout << "testSpelling: getSpelling(" << indexes[0] - << ")=" << Lima::Common::Misc::limastring2utf8stdstring(spelling) << std::endl; - } - // for each id, compare result of getSpelling with element in vector of string [begin,end] - for( uint32_t i = 0 ; i < indexes.size() ; i++ ) { - Lima::LimaString spelling; - try{ - spelling = m_dico.getSpelling(indexes[i]); - if( i%10000 == 1 ) { - std::cout << "testSpelling: getSpelling(" << indexes[i] - << ")=" << Lima::Common::Misc::limastring2utf8stdstring(spelling) << std::endl; - } - if( m_param.withAssert ) { - assert( spelling == (*lemma) ); - } - } - catch(std::logic_error e ) { - std::cout << "testSpelling exception: " << e.what() << std::endl; - } - lemma++; - } -} - -template -void DictTester::testSuper( - typename std::vector::const_iterator begin, - typename std::vector::const_iterator end ) { - - for( typename std::vector::const_iterator it = begin ; - it != end ; it++ ) { - try{ - Lima::LimaString prefix = *it; - std::pair entries = - m_dico.getSuperWords(prefix); - std::cout << "testSuper: getSuperWords(" - << Lima::Common::Misc::limastring2utf8stdstring(prefix) - << ")" << std::endl; - for( ; entries.first != entries.second ; entries.first++ ) { - Lima::LimaString superWord = *(entries.first); - 
std::cout << Lima::Common::Misc::limastring2utf8stdstring(superWord) - << ", " << std::endl; - } - std::cout << std::endl; - } - catch(std::logic_error e ) { - std::cout << "testSuper: getSuperWords exception: " << e.what() << std::endl; - } - } -} - -template - void DictTester::testSub( - std::vector & hyperwords, - std::vector & offsets, - std::vector >& subwords, bool withAssert ) { - - typename std::vector::iterator wordIt; - std::vector::iterator offsetIt = offsets.begin(); - typename std::vector >::iterator answersIt = subwords.begin(); - for( wordIt = hyperwords.begin(); wordIt != hyperwords.end() ; wordIt++ ) { - try{ - Lima::LimaString word = *wordIt; - std::pair entries = m_dico.getSubWords(*offsetIt,word); - FSAALOGINIT; - LDEBUG << "test getSubWords(" - << ", " << word << ")" ; - for( AccessSubWordIterator entry = entries.first ; entry != entries.second ; entry++ ) { - LINFO << "string(" << *offsetIt << "," << (*entry).first << "), "; - } - LINFO ; - for( AccessSubWordIterator entry = entries.first ; entry != entries.second ; entry++ ) { - Lima::LimaString subWord = word.mid(*offsetIt, (*entry).first - *offsetIt); - LINFO << subWord << ", "; - } - LINFO ; - if( withAssert ) { - // r�up�ation des r�onses attendues pour v�ifications - assert( answersIt != subwords.end() ); - std::vector answers = *(answersIt++); - typename std::vector::iterator answerIt = answers.begin(); - for( AccessSubWordIterator entry = entries.first ; entry != entries.second ; entry++ ) { - assert( answerIt != answers.end() ); - Lima::LimaString subWord = word.mid(*offsetIt, (*entry).first - *offsetIt); - assert(!subWord.compare(*answerIt)); - answerIt++; - } - } - } - catch(std::logic_error e ) { - std::cout << "testSub: getSubWords exception: " << e.what() << std::endl; - } - offsetIt++; - } -} - -template -void DictTester::exec( void ) { - if( m_param.withDebug ) { - std::cerr << "Print dictionary...." 
<< std::endl; - m_dico.print(std::cout); - } -} - -template -void DictTester::write( void ) { - try { - if( m_param.outputDico.size() > 0 ) { - std::cerr << "Write dictionary...." << std::endl; - m_dico.write(m_param.outputDico); - } - } - catch(LimaException e ) { - std::cout << "write: exception: " << e.what() << std::endl; - } -} - -int main(int argc, char *argv[]) -{ - QCoreApplication a(argc, argv); - QsLogging::initQsLog(); - - cerr << argv[0] << " begin..." << endl << " command line: "; - for (int i = 0; i < argc; i++) - { - std::cerr << argv[i] << " "; - } - std::cerr << std::endl; - - setlocale(LC_ALL, ""); -#ifdef DEBUG_CD - FSAALOGINIT; - LDEBUG << argv[0] << " begin..." ; -#endif - - // options reading - Param param = { - std::string(), // listOfWords - std::string(), // outputDico - std::string(), // inputDico - false, // subWord - std::string(), // listOfHyperwords - false, // superWord - false, // printGraph - false, // spareMem - one_byte, // charSize - false, // withoutTemplate - true, // trieDirectionForward - false, // withDebug - false, // runPerfo - false, // runIndex - false, // addWord - false, // runSpelling - -1, // termId (-1 means no termId specified by user) - false, // composed - false, // withAssert - std::string() // inputDico - }; - - for (int i = 1 ; i < argc; i++) { - QString arg = QString::fromUtf8(argv[i]); - int pos = -1; - if (arg == "--help") - { - std::cerr << "usage: " << argv[0] - << " --help" << std::endl; - std::cerr << " " << argv[0] - << " [--output=]" - << " [--input=]" - << " [--printGraph]" - << " [--subWord]" - << " [--listOfHyperwords=]" - << " [--listOfWords=]" - << " [--superWord]" - << " [--spare]" - << " [--runIndex]" - << " [--addWord]" - << " [--runSpelling]" - << " [--termId=nn" - << " [--composed=]" - << " [--charSize=<1|2|4>]" - << " [--withoutTemplate" - << " [--reverse]" - << " [--withDebug]" - << " [--runPerfo]" - << " [--withAssert]" - << std::endl; - return 0; - } - else if ( (pos = 
arg.indexOf("--input=")) != -1 ){ - param.inputDico = arg.mid(pos+8).toUtf8().data(); - } - else if ( (pos = arg.indexOf("--output=")) != -1 ){ - param.outputDico = arg.mid(pos+9).toUtf8().data(); - } - else if ( arg =="--printGraph" ){ - param.printGraph = true; - } - else if ( arg == "--subWord" ){ - param.subWord = true; - } - else if ( (pos = arg.indexOf("--listOfHyperwords=")) != -1 ){ - param.listOfHyperwords = arg.mid(pos+19).toUtf8().data(); - } - else if ( (pos = arg.indexOf("--listOfWords=")) != -1 ){ - param.listOfWords = arg.mid(pos+14).toUtf8().data(); - } - else if ( arg == "--superWord" ){ - param.superWord = true; - } - else if ( arg == "--withDebug" ){ - param.withDebug = true; - } - else if ( arg == "--runPerfo" ){ - param.runPerfo = true; - } - else if ( arg == "--withoutTemplate" ){ - param.withoutTemplate = true; - } - else if ( (pos = arg.indexOf("--charSize=")) != -1 ){ - int charSize = (arg.mid(pos+11)).toInt(); - switch(charSize) { - case 1: - param.charSize = one_byte; - break; - case 2: - param.charSize = two_bytes; - break; - case 4: - param.charSize = four_bytes; - break; - } - } - else if ( arg == "--spare" ){ - param.spareMem = true; - } - else if ( arg == "--runIndex" ){ - param.runIndex = true; - } - else if ( arg == "--addWord" ){ - param.addWord = true; - } - else if ( arg == "--runSpelling" ){ - param.runSpelling = true; - } - else if ( (pos = arg.indexOf("--termId=")) != -1 ){ - param.termId = (arg.mid(pos+9)).toInt(); - } - else if ( arg == "--reverse" ){ - param.trieDirectionForward = false; - } - else if ( (pos = arg.indexOf("--composed=")) != -1 ){ - param.composed = true; - param.inputDicoComp = arg.mid(pos+12).toUtf8().data(); - } - else if ( arg == "--withAssert" ){ - param.withAssert = true; - } - } - - cerr << argv[0] << ": "; - if(param.withDebug) - cerr << "--withDebug "; - if(param.runPerfo) - cerr << "--runPerfo "; - if(param.spareMem) - cerr << "--spare "; - if(param.runIndex) - cerr << "--runIndex "; - 
if(param.addWord) - cerr << "--addWord "; - if(param.runSpelling) - cerr << "--runSpelling "; - if(param.printGraph) - cerr << "--printGraph "; - if(!param.trieDirectionForward) - cerr << "--reverse "; - if(!param.withoutTemplate) - cerr << "--withoutTemplate "; - if(param.subWord) { - cerr << "--subWord "; - if(param.listOfHyperwords.size()){ - cerr << "--listOfHyperwords=" << param.listOfHyperwords << " "; - } - } - if(param.composed) - cerr << "--composed=" << param.inputDicoComp << " "; - cerr << "--charSize=" << param.charSize; - if(param.inputDico.size()) { - cerr << "--input='" << param.inputDico << "' "; - } - if(param.outputDico.size()) { - cerr << "--output='" << param.outputDico << "' "; - } - if(param.listOfWords.size()) { - cerr << "--listOfWords='" << param.listOfWords << "'"; - } - cerr << endl; - - DictTester *wspareTester16=0; - DictTester *wbuilderTester16=0; - DictTester *wbuilderRandomTester16=0; - - if( (!param.spareMem) && (param.addWord) ) { - // Si Builder avec option addWord: BuilderRandom - std::cerr << "Create BuilderRandom dictionary...." << std::endl; - Lima::Common::FsaAccess::FsaAccessBuilderRandom16 *dico=0; - if(param.trieDirectionForward) { - dico = new Lima::Common::FsaAccess::FsaAccessBuilderRandom16(); - } - else { - dico = new Lima::Common::FsaAccess::FsaAccessBuilderRandom16(false); - } - if( param.inputDico.size() > 0) { - std::cerr << "Read dictionary from file... " - << param.inputDico << "..." << std::endl; - dico->read(param.inputDico); - } - wbuilderRandomTester16 = new - DictTester( param, *dico ); - if( param.listOfWords.size() > 0 ) { - std::cerr << "addListOfRandomWords " - << param.listOfWords << "..." << std::endl; - wbuilderRandomTester16->addListOfUnorderedWords(); - } - wbuilderRandomTester16->exec(); - wbuilderRandomTester16->write(); - } - - else if ( !param.spareMem) { - // Si Builder sans option addWord: Builder - std::cerr << "Create dictionary...." 
<< std::endl; - Lima::Common::FsaAccess::FsaAccessBuilder16 *dico=0; - if(param.trieDirectionForward) { - dico = new Lima::Common::FsaAccess::FsaAccessBuilder16(); - } - else { - dico = new Lima::Common::FsaAccess::FsaAccessBuilder16(false); - } - if( param.inputDico.size() > 0) { - std::cerr << "no read operation allowed for FsaAccessBuilder " - << std::endl; - return EXIT_FAILURE; - } - - wbuilderTester16 = new - DictTester( param, *dico ); - if( param.listOfWords.size() > 0 ) { - std::cerr << "addListOfWords " - << param.listOfWords << "..." << std::endl; - wbuilderTester16->addListOfWords(); - } - wbuilderTester16->exec(); - wbuilderTester16->write(); - } - else { - int refSize = 1; - int memSize = 0; - int memSize0 = 0; - if( param.runPerfo ) { - refSize = logFileSize( param.listOfWords ); - logFileSize( param.inputDico ); - memSize0 = getProcStat( std::string("VmSize") ); - std::cout << "procSize before load dico = " << memSize0 << std::endl; - TimeUtils::updateCurrentTime(); - } - Lima::Common::FsaAccess::FsaAccessSpare16 *dico = - new Lima::Common::FsaAccess::FsaAccessSpare16(); - dico->read(param.inputDico); - if( param.runPerfo ) { - TimeUtils::logElapsedTime("load dico"); - memSize = getProcStat( std::string("VmSize") ); - std::cout << "procSize after load dico = " << memSize << std::endl; - std::cout << "dico size in mem = " << memSize - memSize0 << std::endl; - std::cout << "compression rate = " << ((memSize - memSize0)*102400.0)/refSize << "%" << std::endl; - } - if( param.printGraph ) { - std::cerr << "Print graph...." 
<< std::endl; - dico->printGraph(std::cerr); - } - wspareTester16 = new - DictTester( - param, *dico ); - - -/* - Lima::LimaString lcwlem0(Misc::utf8stdstring2limastring("b")); - std::cerr << "lcwlem0=" << lcwlem0 << std::endl; - Lima::LimaString & stlem0 = lcwlem0; - Lima::LimaString & stlem1 = lcwlem1; - Lima::LimaString & stlem2 = lcwlem2; - Lima::LimaString & stlem3 = lcwlem3; - Lima::LimaString & stlem4 = lcwlem4; - Lima::LimaString & stlem5 = lcwlem5; - Lima::LimaString & stlem6 = lcwlem6; - Lima::LimaString & stlem7 = lcwlem7; -*/ - if( param.runIndex ) { - std::cerr << "runIndex" << std::endl; - std::vector listOfWords; - std::vector indexes; - - if( param.listOfWords.size() > 0 ) { - readListOfWords(param.listOfWords, listOfWords ); - } - else { - Lima::LimaString lcwlem1(Misc::utf8stdstring2limastring("béc")); - Lima::LimaString lcwlem2(Misc::utf8stdstring2limastring("séc")); - Lima::LimaString lcwlem3(Misc::utf8stdstring2limastring("sél")); - Lima::LimaString lcwlem4(Misc::utf8stdstring2limastring("sé")); - Lima::LimaString lcwlem5(Misc::utf8stdstring2limastring("s")); - Lima::LimaString lcwlem6(Misc::utf8stdstring2limastring("truc")); - Lima::LimaString lcwlem7(Misc::utf8stdstring2limastring("table")); - listOfWords.push_back( Lima::LimaString(lcwlem1) ); - indexes.push_back(1); - listOfWords.push_back( Lima::LimaString(lcwlem2) ); - indexes.push_back(2); - listOfWords.push_back( Lima::LimaString(lcwlem3) ); - indexes.push_back(3); - listOfWords.push_back( Lima::LimaString(lcwlem4) ); - indexes.push_back(4); - listOfWords.push_back( Lima::LimaString(lcwlem5) ); - indexes.push_back(-1); - listOfWords.push_back( Lima::LimaString(lcwlem6) ); - indexes.push_back(-1); - listOfWords.push_back( Lima::LimaString(lcwlem7) ); - indexes.push_back(-1); - }; - std::cerr << "testIndex" << std::endl; -// for( int i = 10 ; i > 0 ; i-- ) - TimeUtils::updateCurrentTime(); - wspareTester16->testIndex(listOfWords.begin(), listOfWords.end(), indexes ); - uint64_t elapsed = 
TimeUtils::elapsedTime(); - TimeUtils::logElapsedTime("testIndex"); - std::cout << "key average size = " << (refSize*1.0)/dico->getSize() << " byte" << std::endl; - std::cout << "testIndex: average time = " << (elapsed*1000.0)/dico->getSize() << std::endl; - } - - if( param.runSpelling ) { - std::vector listOfWords; - std::vector indexes; - - // case 1: ask for spelling of a word given a termId - if( param.termId > 0 ) { - indexes.push_back(param.termId); - std::cerr << "testSpelling with unique termId " << indexes[0] << std::endl; - } - // case 2: check if getSpelling is ok for every id - // (listOfWords is supposed to contain the complete ordered list of terms - else if( param.listOfWords.size() > 0 ) { - readListOfWords(param.listOfWords, listOfWords ); - int index = 1; - for( std::vector::const_iterator it = listOfWords.begin() ; - it != listOfWords.end() ; it++ ) { - indexes.push_back(index++); - std::cerr << "testSpelling with list of " << indexes.size() << " words" << std::endl; - } - } - wspareTester16->testSpelling(listOfWords.begin(), listOfWords.end(), indexes ); - } - if( param.superWord) { - std::cerr << "runSuper" << std::endl; - std::vector listOfWords; - Lima::LimaString vide; - listOfWords.push_back(vide); - - if( param.listOfWords.size() > 0 ) { - readListOfWords(param.listOfWords, listOfWords ); - } - wspareTester16->testSuper(listOfWords.begin(), listOfWords.end()); - } - - wspareTester16->exec(); - if( param.subWord) { - // cha�e �d�ouper - std::vector hyperwords; - // offset de localisation de l'hypermot dans la cha�e - std::vector offsets; - // r�onses du dictionnaire sur l'appel �getSubword - std::vector > subwords; - if( param.listOfHyperwords.size() > 0 ) { - std::ifstream Hlist(param.listOfHyperwords.c_str(), std::ios::in | std::ios::binary ); - if ( !Hlist.is_open() ) { - std::cerr << "Cannot open list of (hyperword,offset..) 
" << param.listOfHyperwords << std::endl; - return EXIT_FAILURE; - } - std::cerr << "Read hyperword and offset...." << std::endl; - std::string line; - - for( int counter = 0 ; ; counter++ ) { - // lecture d'une ligne du fichier de test - line = Lima::Common::Misc::readLine(Hlist); - if( line.size() == 0 ) { - std::cerr << "end of list of (hyperword,offset)." << std::endl; - break; - } - else { - // extraction chaine a decouper - std::string::size_type hyperword_pos = line.find(';'); - std::string utf8_hyperword(line, 0, hyperword_pos); - Lima::LimaString hyperword = Lima::Common::Misc::utf8stdstring2limastring(utf8_hyperword); - hyperwords.push_back(hyperword); - std::cerr << "push(" << hyperword; -// std::cerr << "offset=" << hyperword_pos << std::endl; - // extraction offset - std::string::size_type offset_pos = line.find(';', hyperword_pos+1); - std::string offset_str(line, hyperword_pos+1, offset_pos-(hyperword_pos+1)); - int offset = std::atoi(offset_str.c_str()); - offsets.push_back(offset); - std::cerr << "," << offset; -// std::cerr << "offset=" << offset_pos << std::endl; - // extraction liste de r�onses attendues - std::vector answers; - std::string::size_type subword_pos0 = offset_pos; - std::string::size_type subword_pos = line.find(';', subword_pos0+1); - for( ; subword_pos != std::string::npos ; subword_pos = line.find(';', subword_pos0+1) ) { - std::string utf8_answer(line, subword_pos0+1, subword_pos-(subword_pos0+1)); - Lima::LimaString answer = Lima::Common::Misc::utf8stdstring2limastring(utf8_answer); - answers.push_back(answer); - std::cerr << "," << answer; -// std::cerr << "offset=" << subword_pos << std::endl; - subword_pos0 = subword_pos; - } - subwords.push_back(answers); - std::cerr << ")" << std::endl; - } - } - } - else { - Lima::LimaString lcwhyper1(Misc::utf8stdstring2limastring("séc")); - Lima::LimaString lcwhyper2(Misc::utf8stdstring2limastring("abcséc")); - Lima::LimaString lcwhyper3(Misc::utf8stdstring2limastring("truc")); - 
Lima::LimaString & stlem1 = lcwhyper1; - Lima::LimaString & stlem2 = lcwhyper2; - Lima::LimaString & stlem3 = lcwhyper3; - - hyperwords.push_back(Lima::LimaString(stlem1)); // s� - offsets.push_back(0); - hyperwords.push_back(Lima::LimaString(stlem2)); // abcs� - offsets.push_back(3); - hyperwords.push_back(Lima::LimaString(stlem3)); // truc - offsets.push_back(0); - } - wspareTester16->testSub(hyperwords, offsets, subwords, param.withAssert); - } -// wspareTester16->write(); - } - - return EXIT_SUCCESS; -} +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. 
If not, see +*/ +/*************************************************************************** + testFsaDict16.cpp - description + ------------------- + begin : lun jun 2 2003 + copyright : (C) 2003 by Olivier Mesnard + email : olivier.mesnard@cea.fr +// ***************************************************************************/ + +/*************************************************************************** + * * + * compact dictionnary based on finite state automata * + * implemented with Boost Graph library * + * * + ***************************************************************************/ +#include "common/LimaCommon.h" + +#include "common/time/traceUtils.h" + +// string and file handling Utilities +#include "common/Data/strwstrtools.h" + +// dictionaries +#include "common/FsaAccess/FsaAccessBuilder16.h" +#include "common/FsaAccess/FsaAccessBuilderRandom16.h" +#include "common/FsaAccess/FsaAccessSpare16.h" +#include "common/misc/AbstractAccessByString.h" + +#include + +// for set locale +#include +// for system() +#include + +#include +#include +#include +#include +#include +#include +#include + +// For ::stat() function +#include +#include +#ifndef WIN32 +#include +#endif +#ifdef ANTINNO_SPECIFIC +// FWI 18/02/2014 : ajout 2 undef +#ifdef WIN32 +#undef max +#undef min +#endif +#endif + +using namespace std; +using namespace Lima; +using namespace Lima::Common; + +int logFileSize( const std::string& filename ) { + struct stat sts; + if( stat( filename.c_str(), &sts) != 0) + std::cerr << "logFileSize: error getting info for file " << filename << std::endl; + std::cout << "taille fichier: " << filename << "= " << sts.st_size << std::endl; + return sts.st_size; +} + +void logMemsize( const string& legend ) { +#ifdef WIN32 + LIMA_UNUSED(legend); +#else + pid_t pid = getpid(); + ostringstream ostr; + ostr << "/proc/" << pid << "/status"; + ifstream statusFile(ostr.str().c_str(), std::ifstream::binary); + char strbuff[200]; + for( ; ; ) { + string status; + 
statusFile.getline(strbuff, 200, '\n' ); + string line(strbuff); + if(line.empty() ) + break; + string::size_type composed1_pos = line.find("VmSize:"); + if( composed1_pos != string::npos ) { + string vmSizeStr(line, composed1_pos+7); + int vmSize = atoi(vmSizeStr.c_str()); + std::cerr << legend << " VmSize:" << vmSize; + } + } +#endif +} + +int getProcStat( const std::string& toLog ) { +#ifdef WIN32 + LIMA_UNUSED(toLog); + return 0; +#else + std::string statusFile; + + ostringstream os; + os << "/proc/" << getpid() << "/status"; + statusFile=os.str(); + + ifstream statusIn(statusFile.c_str(),ios::in | std::ifstream::binary); + string line; + int val; + while (!statusIn.eof()) + { + getline(statusIn,line); +// std::cout << "line = " << line << std::endl; + size_t index=line.find(toLog); + if( index != std::string::npos ) { +// std::cout << "index = " << index << std::endl; + string valstr=line.substr(index+toLog.size()+1); +// std::cout << "valstr = " << valstr << std::endl; + val = atoi(valstr.c_str()); + std::cout << toLog << "=" << val < & listOfWords ) + { + std::ifstream wList(listOfWordsFilename.c_str(), std::ios::in | std::ios::binary ); + if ( !wList.is_open() ) { + std::cerr << "Cannot open list of words " << listOfWordsFilename << std::endl; + return EXIT_FAILURE; + } + std::cerr << "Read list of words" << std::endl; + char strbuff[200]; + + for( int counter = 0 ; ; counter++ ) { + // lecture d'une ligne du fichier + wList.getline(strbuff, 200, '\n' ); + string line(strbuff); + if( line.size() == 0 ) { + std::cerr << "end of list of words. 
counter=" << counter << std::endl; + break; + } + else { + // extraction cha�e + Lima::LimaString word = Lima::Common::Misc::utf8stdstring2limastring(line); + listOfWords.push_back(word); + } + } + return EXIT_SUCCESS; +} + + +template +class DictTester { + public: + DictTester(Param param, dictType &dico) : m_param(param), m_dico(dico) { + } + void exec( void ); + void testSub(std::vector& hyperwords, + std::vector& offsets, + std::vector > &subwords, bool withAssert ); + void testSuper(typename std::vector::const_iterator begin, + typename std::vector::const_iterator end ); + void testIndex( typename std::vector::const_iterator begin, + typename std::vector::const_iterator end, + const std::vector& indexes ); + void testSpelling( typename std::vector::const_iterator begin, + typename std::vector::const_iterator end, + const std::vector& indexes ); + void addListOfWords(); + void addListOfUnorderedWords(); + void write( void ); + private: + Param m_param; + dictType &m_dico; +}; + + +template +void DictTester::addListOfWords() { + + if( !m_param.listOfWords.size() ) + return; + + std::ifstream wList(m_param.listOfWords.c_str(), std::ios::in | std::ios::binary ); + if ( !wList.is_open() ) { + std::cerr << "Cannot open list of words " << m_param.listOfWords << std::endl; + return; + } + std::cerr << "Read list of words" << std::endl; + char strbuff[200]; + + for( int counter = 0 ; ; counter++ ) { + if( (counter%10000) == 0 ) { + ostringstream ostr; + ostr << "\naddListOfWords counter = " << counter; +// std::cerr << "addListOfWords counter = " << counter << std::endl; + logMemsize( ostr.str() ); + } + // lecture d'une ligne du fichier + wList.getline(strbuff, 200, '\n' ); + string line(strbuff); + if( wList.eof() ) + { + std::cerr << "end of list of words. 
counter=" << counter << std::endl; + break; + } + else if (!line.empty()) + { +// std::cerr << "addListOfWords: (" << line << ")" << std::endl; + Lima::LimaString word = Lima::Common::Misc::utf8stdstring2limastring(line); +// std::cerr << "addListOfWords: addWord(" << word << ")" << std::endl; + m_dico.addWord( word ); + } + } + std::cerr << std::endl; + m_dico.pack(); +} + +template +void DictTester::addListOfUnorderedWords() { + + if( m_param.printGraph ) { + std::cerr << "Print graph...." << std::endl; + m_dico.printGraph(std::cerr); + } + + if( !m_param.listOfWords.compare(std::string("")) ) + return; + + std::vector listOfWords; + readListOfWords(m_param.listOfWords, listOfWords); + + int counter(0); + for( std::vector::iterator itWord = listOfWords.begin() ; + itWord != listOfWords.end() ; itWord++, counter++ ) { +// if( (counter%10000) == 0 ) { + std::cerr << "addListOfWords(" << *itWord << "), counter = " << counter << std::endl; +// } + m_dico.addRandomWord( *itWord ); + } + + if( m_param.printGraph ) { + std::cerr << "Print graph...." 
<< std::endl; + m_dico.printGraph(std::cerr); + } +// m_dico.pack(); +} + +template +void DictTester::testIndex( + typename std::vector::const_iterator begin, + typename std::vector::const_iterator end, + const std::vector& indexes ) { + std::cout << "testIndex: getSize() = " << m_dico.getSize() << std::endl; + + std::vector::const_iterator indexItr = indexes.begin(); + int index0 = 0; + + for( typename std::vector::const_iterator lemma = begin ; + lemma != end ; lemma++ ) { + // recup�ation de l'index �partir de la cha�e de caract�es + int index = m_dico.getIndex(*lemma); + // traces + if( index%10000 == 0 ) { + ostringstream ostr; + ostr << "testIndex index = " << index; +// std::cerr << "addListOfWords counter = " << counter << std::endl; + logMemsize( ostr.str() ); + } + if( m_param.withDebug ) { + Lima::LimaString newWord = *lemma; + std::cout << "testIndex: getIndex(" + << Lima::Common::Misc::limastring2utf8stdstring(newWord) + << ")=" << index << std::endl; + } + else { + if( index%10000 == 1 ) { + Lima::LimaString newWord = *lemma; + std::cout << "testIndex: getIndex(" << Lima::Common::Misc::limastring2utf8stdstring(newWord) + << ")=" << index << std::endl; + } + } + // result verification + if( m_param.withAssert ) { + if( indexItr != indexes.end() ) { +// std::cerr << "check " << index << "!=" << *indexItr << std::endl; + assert( index == *indexItr); + indexItr++; + } + else { +// std::cerr << "check " << index << "!=" << index0+1 << std::endl; + assert( index == index0+1 ); + index0 = index; + } + } + } + + // test sur chaine n'existant pas + for( typename std::vector::const_iterator lemma = begin ; + lemma != end ; lemma++ ) { + int index = m_dico.getIndex(*lemma); + Lima::LimaString invertedLemma; + for( int i = (*lemma).size()-1; i >= 0 ; i-- ) { + invertedLemma.push_back((*lemma)[i]); + } + int invertedIndex = m_dico.getIndex(invertedLemma); + // traces + if( index%10000 == 0 ) { + ostringstream ostr; + ostr << "testIndex inverted (" + << 
Lima::Common::Misc::limastring2utf8stdstring(invertedLemma) + << ") index = " << invertedIndex; + logMemsize( ostr.str() ); + } + } +} + +template + void DictTester::testSpelling( typename std::vector::const_iterator begin, + typename std::vector::const_iterator end, + const std::vector& indexes ) +//void DictTester::testSpelling( int *indexVal, int nbIndex ) +{ + LIMA_UNUSED(end); + typename std::vector::const_iterator lemma = begin; + + // if size of indexes = 1, we just display the string return by getSpelling() + std::cout << "testSpelling: getSpelling: indexes.size()=" << indexes.size() << std::endl; + if( indexes.size() == 1 ) { + Lima::LimaString spelling; + spelling = m_dico.getSpelling(indexes[0]); + std::cout << "testSpelling: getSpelling(" << indexes[0] + << ")=" << Lima::Common::Misc::limastring2utf8stdstring(spelling) << std::endl; + } + // for each id, compare result of getSpelling with element in vector of string [begin,end] + for( uint32_t i = 0 ; i < indexes.size() ; i++ ) { + Lima::LimaString spelling; + try{ + spelling = m_dico.getSpelling(indexes[i]); + if( i%10000 == 1 ) { + std::cout << "testSpelling: getSpelling(" << indexes[i] + << ")=" << Lima::Common::Misc::limastring2utf8stdstring(spelling) << std::endl; + } + if( m_param.withAssert ) { + assert( spelling == (*lemma) ); + } + } + catch(std::logic_error e ) { + std::cout << "testSpelling exception: " << e.what() << std::endl; + } + lemma++; + } +} + +template +void DictTester::testSuper( + typename std::vector::const_iterator begin, + typename std::vector::const_iterator end ) { + + for( typename std::vector::const_iterator it = begin ; + it != end ; it++ ) { + try{ + Lima::LimaString prefix = *it; + std::pair entries = + m_dico.getSuperWords(prefix); + std::cout << "testSuper: getSuperWords(" + << Lima::Common::Misc::limastring2utf8stdstring(prefix) + << ")" << std::endl; + for( ; entries.first != entries.second ; entries.first++ ) { + Lima::LimaString superWord = *(entries.first); + 
std::cout << Lima::Common::Misc::limastring2utf8stdstring(superWord) + << ", " << std::endl; + } + std::cout << std::endl; + } + catch(std::logic_error e ) { + std::cout << "testSuper: getSuperWords exception: " << e.what() << std::endl; + } + } +} + +template + void DictTester::testSub( + std::vector & hyperwords, + std::vector & offsets, + std::vector >& subwords, bool withAssert ) { + + typename std::vector::iterator wordIt; + std::vector::iterator offsetIt = offsets.begin(); + typename std::vector >::iterator answersIt = subwords.begin(); + for( wordIt = hyperwords.begin(); wordIt != hyperwords.end() ; wordIt++ ) { + try{ + Lima::LimaString word = *wordIt; + std::pair entries = m_dico.getSubWords(*offsetIt,word); + FSAALOGINIT; + LDEBUG << "test getSubWords(" + << ", " << word << ")" ; + for( AccessSubWordIterator entry = entries.first ; entry != entries.second ; entry++ ) { + LINFO << "string(" << *offsetIt << "," << (*entry).first << "), "; + } + LINFO ; + for( AccessSubWordIterator entry = entries.first ; entry != entries.second ; entry++ ) { + Lima::LimaString subWord = word.mid(*offsetIt, (*entry).first - *offsetIt); + LINFO << subWord << ", "; + } + LINFO ; + if( withAssert ) { + // r�up�ation des r�onses attendues pour v�ifications + assert( answersIt != subwords.end() ); + std::vector answers = *(answersIt++); + typename std::vector::iterator answerIt = answers.begin(); + for( AccessSubWordIterator entry = entries.first ; entry != entries.second ; entry++ ) { + assert( answerIt != answers.end() ); + Lima::LimaString subWord = word.mid(*offsetIt, (*entry).first - *offsetIt); + assert(!subWord.compare(*answerIt)); + answerIt++; + } + } + } + catch(std::logic_error e ) { + std::cout << "testSub: getSubWords exception: " << e.what() << std::endl; + } + offsetIt++; + } +} + +template +void DictTester::exec( void ) { + if( m_param.withDebug ) { + std::cerr << "Print dictionary...." 
<< std::endl; + m_dico.print(std::cout); + } +} + +template +void DictTester::write( void ) { + try { + if( m_param.outputDico.size() > 0 ) { + std::cerr << "Write dictionary...." << std::endl; + m_dico.write(m_param.outputDico); + } + } + catch(LimaException e ) { + std::cout << "write: exception: " << e.what() << std::endl; + } +} + +int main(int argc, char *argv[]) +{ + QCoreApplication a(argc, argv); +#ifdef ANTINNO_SPECIFIC + { + ::std::string const configDir = ::std::getenv("AMOSE_CONF"); + if (configDir.empty()) + { + std::cerr << "No environment variable \"AMOSE_CONF\" set or variable is empty" << std::endl; + return EXIT_FAILURE; + } + ::std::string const log4cppFilePath = configDir + "/log4cpp.properties"; + ::boost::shared_ptr pLog1(new QsLogging::antinno::Log4cpp()); + pLog1->configure(log4cppFilePath); + QsLogging::antinno::log = pLog1; + } +#else + QsLogging::initQsLog(); +#endif + + cerr << argv[0] << " begin..." << endl << " command line: "; + for (int i = 0; i < argc; i++) + { + std::cerr << argv[i] << " "; + } + std::cerr << std::endl; + + setlocale(LC_ALL, ""); +#ifdef DEBUG_CD + FSAALOGINIT; + LDEBUG << argv[0] << " begin..." 
; +#endif + + // options reading + Param param = { + std::string(), // listOfWords + std::string(), // outputDico + std::string(), // inputDico + false, // subWord + std::string(), // listOfHyperwords + false, // superWord + false, // printGraph + false, // spareMem + one_byte, // charSize + false, // withoutTemplate + true, // trieDirectionForward + false, // withDebug + false, // runPerfo + false, // runIndex + false, // addWord + false, // runSpelling + -1, // termId (-1 means no termId specified by user) + false, // composed + false, // withAssert + std::string() // inputDico + }; + + for (int i = 1 ; i < argc; i++) { + QString arg = QString::fromUtf8(argv[i]); + int pos = -1; + if (arg == "--help") + { + std::cerr << "usage: " << argv[0] + << " --help" << std::endl; + std::cerr << " " << argv[0] + << " [--output=]" + << " [--input=]" + << " [--printGraph]" + << " [--subWord]" + << " [--listOfHyperwords=]" + << " [--listOfWords=]" + << " [--superWord]" + << " [--spare]" + << " [--runIndex]" + << " [--addWord]" + << " [--runSpelling]" + << " [--termId=nn" + << " [--composed=]" + << " [--charSize=<1|2|4>]" + << " [--withoutTemplate" + << " [--reverse]" + << " [--withDebug]" + << " [--runPerfo]" + << " [--withAssert]" + << std::endl; + return 0; + } + else if ( (pos = arg.indexOf("--input=")) != -1 ){ + param.inputDico = arg.mid(pos+8).toUtf8().data(); + } + else if ( (pos = arg.indexOf("--output=")) != -1 ){ + param.outputDico = arg.mid(pos+9).toUtf8().data(); + } + else if ( arg =="--printGraph" ){ + param.printGraph = true; + } + else if ( arg == "--subWord" ){ + param.subWord = true; + } + else if ( (pos = arg.indexOf("--listOfHyperwords=")) != -1 ){ + param.listOfHyperwords = arg.mid(pos+19).toUtf8().data(); + } + else if ( (pos = arg.indexOf("--listOfWords=")) != -1 ){ + param.listOfWords = arg.mid(pos+14).toUtf8().data(); + } + else if ( arg == "--superWord" ){ + param.superWord = true; + } + else if ( arg == "--withDebug" ){ + param.withDebug = true; + } + 
else if ( arg == "--runPerfo" ){ + param.runPerfo = true; + } + else if ( arg == "--withoutTemplate" ){ + param.withoutTemplate = true; + } + else if ( (pos = arg.indexOf("--charSize=")) != -1 ){ + int charSize = (arg.mid(pos+11)).toInt(); + switch(charSize) { + case 1: + param.charSize = one_byte; + break; + case 2: + param.charSize = two_bytes; + break; + case 4: + param.charSize = four_bytes; + break; + } + } + else if ( arg == "--spare" ){ + param.spareMem = true; + } + else if ( arg == "--runIndex" ){ + param.runIndex = true; + } + else if ( arg == "--addWord" ){ + param.addWord = true; + } + else if ( arg == "--runSpelling" ){ + param.runSpelling = true; + } + else if ( (pos = arg.indexOf("--termId=")) != -1 ){ + param.termId = (arg.mid(pos+9)).toInt(); + } + else if ( arg == "--reverse" ){ + param.trieDirectionForward = false; + } + else if ( (pos = arg.indexOf("--composed=")) != -1 ){ + param.composed = true; + param.inputDicoComp = arg.mid(pos+12).toUtf8().data(); + } + else if ( arg == "--withAssert" ){ + param.withAssert = true; + } + } + + cerr << argv[0] << ": "; + if(param.withDebug) + cerr << "--withDebug "; + if(param.runPerfo) + cerr << "--runPerfo "; + if(param.spareMem) + cerr << "--spare "; + if(param.runIndex) + cerr << "--runIndex "; + if(param.addWord) + cerr << "--addWord "; + if(param.runSpelling) + cerr << "--runSpelling "; + if(param.printGraph) + cerr << "--printGraph "; + if(!param.trieDirectionForward) + cerr << "--reverse "; + if(!param.withoutTemplate) + cerr << "--withoutTemplate "; + if(param.subWord) { + cerr << "--subWord "; + if(param.listOfHyperwords.size()){ + cerr << "--listOfHyperwords=" << param.listOfHyperwords << " "; + } + } + if(param.composed) + cerr << "--composed=" << param.inputDicoComp << " "; + cerr << "--charSize=" << param.charSize; + if(param.inputDico.size()) { + cerr << "--input='" << param.inputDico << "' "; + } + if(param.outputDico.size()) { + cerr << "--output='" << param.outputDico << "' "; + } + 
if(param.listOfWords.size()) { + cerr << "--listOfWords='" << param.listOfWords << "'"; + } + cerr << endl; + + DictTester *wspareTester16=0; + DictTester *wbuilderTester16=0; + DictTester *wbuilderRandomTester16=0; + + if( (!param.spareMem) && (param.addWord) ) { + // Si Builder avec option addWord: BuilderRandom + std::cerr << "Create BuilderRandom dictionary...." << std::endl; + Lima::Common::FsaAccess::FsaAccessBuilderRandom16 *dico=0; + if(param.trieDirectionForward) { + dico = new Lima::Common::FsaAccess::FsaAccessBuilderRandom16(); + } + else { + dico = new Lima::Common::FsaAccess::FsaAccessBuilderRandom16(false); + } + if( param.inputDico.size() > 0) { + std::cerr << "Read dictionary from file... " + << param.inputDico << "..." << std::endl; + dico->read(param.inputDico); + } + wbuilderRandomTester16 = new + DictTester( param, *dico ); + if( param.listOfWords.size() > 0 ) { + std::cerr << "addListOfRandomWords " + << param.listOfWords << "..." << std::endl; + wbuilderRandomTester16->addListOfUnorderedWords(); + } + wbuilderRandomTester16->exec(); + wbuilderRandomTester16->write(); + } + + else if ( !param.spareMem) { + // Si Builder sans option addWord: Builder + std::cerr << "Create dictionary...." << std::endl; + Lima::Common::FsaAccess::FsaAccessBuilder16 *dico=0; + if(param.trieDirectionForward) { + dico = new Lima::Common::FsaAccess::FsaAccessBuilder16(); + } + else { + dico = new Lima::Common::FsaAccess::FsaAccessBuilder16(false); + } + if( param.inputDico.size() > 0) { + std::cerr << "no read operation allowed for FsaAccessBuilder " + << std::endl; + return EXIT_FAILURE; + } + + wbuilderTester16 = new + DictTester( param, *dico ); + if( param.listOfWords.size() > 0 ) { + std::cerr << "addListOfWords " + << param.listOfWords << "..." 
<< std::endl; + wbuilderTester16->addListOfWords(); + } + wbuilderTester16->exec(); + wbuilderTester16->write(); + } + else { + int refSize = 1; + int memSize = 0; + int memSize0 = 0; + if( param.runPerfo ) { + refSize = logFileSize( param.listOfWords ); + logFileSize( param.inputDico ); + memSize0 = getProcStat( std::string("VmSize") ); + std::cout << "procSize before load dico = " << memSize0 << std::endl; + TimeUtils::updateCurrentTime(); + } + Lima::Common::FsaAccess::FsaAccessSpare16 *dico = + new Lima::Common::FsaAccess::FsaAccessSpare16(); + dico->read(param.inputDico); + if( param.runPerfo ) { + TimeUtils::logElapsedTime("load dico"); + memSize = getProcStat( std::string("VmSize") ); + std::cout << "procSize after load dico = " << memSize << std::endl; + std::cout << "dico size in mem = " << memSize - memSize0 << std::endl; + std::cout << "compression rate = " << ((memSize - memSize0)*102400.0)/refSize << "%" << std::endl; + } + if( param.printGraph ) { + std::cerr << "Print graph...." 
<< std::endl; + dico->printGraph(std::cerr); + } + wspareTester16 = new + DictTester( + param, *dico ); + + +/* + Lima::LimaString lcwlem0(Misc::utf8stdstring2limastring("b")); + std::cerr << "lcwlem0=" << lcwlem0 << std::endl; + Lima::LimaString & stlem0 = lcwlem0; + Lima::LimaString & stlem1 = lcwlem1; + Lima::LimaString & stlem2 = lcwlem2; + Lima::LimaString & stlem3 = lcwlem3; + Lima::LimaString & stlem4 = lcwlem4; + Lima::LimaString & stlem5 = lcwlem5; + Lima::LimaString & stlem6 = lcwlem6; + Lima::LimaString & stlem7 = lcwlem7; +*/ + if( param.runIndex ) { + std::cerr << "runIndex" << std::endl; + std::vector listOfWords; + std::vector indexes; + + if( param.listOfWords.size() > 0 ) { + readListOfWords(param.listOfWords, listOfWords ); + } + else { + Lima::LimaString lcwlem1(Misc::utf8stdstring2limastring("béc")); + Lima::LimaString lcwlem2(Misc::utf8stdstring2limastring("séc")); + Lima::LimaString lcwlem3(Misc::utf8stdstring2limastring("sél")); + Lima::LimaString lcwlem4(Misc::utf8stdstring2limastring("sé")); + Lima::LimaString lcwlem5(Misc::utf8stdstring2limastring("s")); + Lima::LimaString lcwlem6(Misc::utf8stdstring2limastring("truc")); + Lima::LimaString lcwlem7(Misc::utf8stdstring2limastring("table")); + listOfWords.push_back( Lima::LimaString(lcwlem1) ); + indexes.push_back(1); + listOfWords.push_back( Lima::LimaString(lcwlem2) ); + indexes.push_back(2); + listOfWords.push_back( Lima::LimaString(lcwlem3) ); + indexes.push_back(3); + listOfWords.push_back( Lima::LimaString(lcwlem4) ); + indexes.push_back(4); + listOfWords.push_back( Lima::LimaString(lcwlem5) ); + indexes.push_back(-1); + listOfWords.push_back( Lima::LimaString(lcwlem6) ); + indexes.push_back(-1); + listOfWords.push_back( Lima::LimaString(lcwlem7) ); + indexes.push_back(-1); + }; + std::cerr << "testIndex" << std::endl; +// for( int i = 10 ; i > 0 ; i-- ) + TimeUtils::updateCurrentTime(); + wspareTester16->testIndex(listOfWords.begin(), listOfWords.end(), indexes ); + uint64_t elapsed = 
TimeUtils::elapsedTime(); + TimeUtils::logElapsedTime("testIndex"); + std::cout << "key average size = " << (refSize*1.0)/dico->getSize() << " byte" << std::endl; + std::cout << "testIndex: average time = " << (elapsed*1000.0)/dico->getSize() << std::endl; + } + + if( param.runSpelling ) { + std::vector listOfWords; + std::vector indexes; + + // case 1: ask for spelling of a word given a termId + if( param.termId > 0 ) { + indexes.push_back(param.termId); + std::cerr << "testSpelling with unique termId " << indexes[0] << std::endl; + } + // case 2: check if getSpelling is ok for every id + // (listOfWords is supposed to contain the complete ordered list of terms + else if( param.listOfWords.size() > 0 ) { + readListOfWords(param.listOfWords, listOfWords ); + int index = 1; + for( std::vector::const_iterator it = listOfWords.begin() ; + it != listOfWords.end() ; it++ ) { + indexes.push_back(index++); + std::cerr << "testSpelling with list of " << indexes.size() << " words" << std::endl; + } + } + wspareTester16->testSpelling(listOfWords.begin(), listOfWords.end(), indexes ); + } + if( param.superWord) { + std::cerr << "runSuper" << std::endl; + std::vector listOfWords; + Lima::LimaString vide; + listOfWords.push_back(vide); + + if( param.listOfWords.size() > 0 ) { + readListOfWords(param.listOfWords, listOfWords ); + } + wspareTester16->testSuper(listOfWords.begin(), listOfWords.end()); + } + + wspareTester16->exec(); + if( param.subWord) { + // cha�e �d�ouper + std::vector hyperwords; + // offset de localisation de l'hypermot dans la cha�e + std::vector offsets; + // r�onses du dictionnaire sur l'appel �getSubword + std::vector > subwords; + if( param.listOfHyperwords.size() > 0 ) { + std::ifstream Hlist(param.listOfHyperwords.c_str(), std::ios::in | std::ios::binary ); + if ( !Hlist.is_open() ) { + std::cerr << "Cannot open list of (hyperword,offset..) 
" << param.listOfHyperwords << std::endl; + return EXIT_FAILURE; + } + std::cerr << "Read hyperword and offset...." << std::endl; + std::string line; + + for( int counter = 0 ; ; counter++ ) { + // lecture d'une ligne du fichier de test + line = Lima::Common::Misc::readLine(Hlist); + if( line.size() == 0 ) { + std::cerr << "end of list of (hyperword,offset)." << std::endl; + break; + } + else { + // extraction chaine a decouper + std::string::size_type hyperword_pos = line.find(';'); + std::string utf8_hyperword(line, 0, hyperword_pos); + Lima::LimaString hyperword = Lima::Common::Misc::utf8stdstring2limastring(utf8_hyperword); + hyperwords.push_back(hyperword); + std::cerr << "push(" << hyperword; +// std::cerr << "offset=" << hyperword_pos << std::endl; + // extraction offset + std::string::size_type offset_pos = line.find(';', hyperword_pos+1); + std::string offset_str(line, hyperword_pos+1, offset_pos-(hyperword_pos+1)); + int offset = std::atoi(offset_str.c_str()); + offsets.push_back(offset); + std::cerr << "," << offset; +// std::cerr << "offset=" << offset_pos << std::endl; + // extraction liste de r�onses attendues + std::vector answers; + std::string::size_type subword_pos0 = offset_pos; + std::string::size_type subword_pos = line.find(';', subword_pos0+1); + for( ; subword_pos != std::string::npos ; subword_pos = line.find(';', subword_pos0+1) ) { + std::string utf8_answer(line, subword_pos0+1, subword_pos-(subword_pos0+1)); + Lima::LimaString answer = Lima::Common::Misc::utf8stdstring2limastring(utf8_answer); + answers.push_back(answer); + std::cerr << "," << answer; +// std::cerr << "offset=" << subword_pos << std::endl; + subword_pos0 = subword_pos; + } + subwords.push_back(answers); + std::cerr << ")" << std::endl; + } + } + } + else { + Lima::LimaString lcwhyper1(Misc::utf8stdstring2limastring("séc")); + Lima::LimaString lcwhyper2(Misc::utf8stdstring2limastring("abcséc")); + Lima::LimaString lcwhyper3(Misc::utf8stdstring2limastring("truc")); + 
Lima::LimaString & stlem1 = lcwhyper1; + Lima::LimaString & stlem2 = lcwhyper2; + Lima::LimaString & stlem3 = lcwhyper3; + + hyperwords.push_back(Lima::LimaString(stlem1)); // s� + offsets.push_back(0); + hyperwords.push_back(Lima::LimaString(stlem2)); // abcs� + offsets.push_back(3); + hyperwords.push_back(Lima::LimaString(stlem3)); // truc + offsets.push_back(0); + } + wspareTester16->testSub(hyperwords, offsets, subwords, param.withAssert); + } +// wspareTester16->write(); + } + + return EXIT_SUCCESS; +} + + diff --git a/lima_linguisticprocessing/src/linguisticProcessing/client/AbstractLinguisticProcessingClient.h b/lima_linguisticprocessing/src/linguisticProcessing/client/AbstractLinguisticProcessingClient.h index f70ffd2c8..2e0444d20 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/client/AbstractLinguisticProcessingClient.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/client/AbstractLinguisticProcessingClient.h @@ -88,7 +88,11 @@ class LIMA_LINGUISTICPROCESSIONGCLIENT_EXPORT AbstractLinguisticProcessingClient const std::map& metaData, const std::string& pipeline, const std::map& handlers, - const std::set& inactiveUnits = std::set()) const = 0; + const std::set& inactiveUnits = std::set() +#ifdef ANTINNO_SPECIFIC + , Lima::StopAnalyze const& stopAnalyze = Lima::defaultStopAnalyze +#endif + ) const = 0; /** * This function is the same as the previous one but takes a text @@ -99,7 +103,11 @@ class LIMA_LINGUISTICPROCESSIONGCLIENT_EXPORT AbstractLinguisticProcessingClient const std::map& metaData, const std::string& pipeline, const std::map& handlers, - const std::set& inactiveUnits = std::set()) const = 0; + const std::set& inactiveUnits = std::set() +#ifdef ANTINNO_SPECIFIC + ,Lima::StopAnalyze const& stopAnalyze = Lima::defaultStopAnalyze +#endif + ) const = 0; }; /** @@ -141,7 +149,7 @@ class AbstractLinguisticProcessingClientFactory : public RegistrableFactory createClient() const = 0; /** * virtual destructor of the 
LinguisticProcessing client factory diff --git a/lima_linguisticprocessing/src/linguisticProcessing/client/AnalysisHandlers/BowDocumentHandler.cpp b/lima_linguisticprocessing/src/linguisticProcessing/client/AnalysisHandlers/BowDocumentHandler.cpp index dd57a242b..ff2586b8d 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/client/AnalysisHandlers/BowDocumentHandler.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/client/AnalysisHandlers/BowDocumentHandler.cpp @@ -83,7 +83,7 @@ void BowDocumentHandler::endDocument() // read Part( istream, AbstractBoWXMLWriter writer, bool useIterator) // do not use iterator, // std::cout is unused - reader.readBoWDocumentBlock(in, *document, structuredBowHandler, false); + reader.readBoWDocumentBlock(in, *document, structuredBowHandler, false, false); } delete m_bowstream; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/client/AnalysisHandlers/LTRTextHandler.cpp b/lima_linguisticprocessing/src/linguisticProcessing/client/AnalysisHandlers/LTRTextHandler.cpp new file mode 100644 index 000000000..5b8b18557 --- /dev/null +++ b/lima_linguisticprocessing/src/linguisticProcessing/client/AnalysisHandlers/LTRTextHandler.cpp @@ -0,0 +1,87 @@ +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. 
If not, see +*/ +/*************************************************************************** + * Copyright (C) 2004-2012 by CEA LIST * + * * + ***************************************************************************/ + +#include "LTRTextHandler.h" +#include "common/AbstractFactoryPattern/SimpleFactory.h" + +using namespace std; +using namespace Lima::Common::BagOfWords; + +namespace Lima { +namespace LinguisticProcessing { + +LTRTextHandler::LTRTextHandler() + : AbstractTextualAnalysisHandler(),m_ltrstream(),m_ltrtext() +{ +} + + +LTRTextHandler::~LTRTextHandler() +{ +} + +Common::BagOfWords::LTR_Text& LTRTextHandler::getLTRText() +{ + return m_ltrtext; +} + + +void LTRTextHandler::endAnalysis() +{ + // read from completed stream + m_ltrtext.binaryReadFrom(m_ltrstream); +} + + +void LTRTextHandler::startAnalysis() +{ + m_ltrtext.clear(); + // reset stringstream + m_ltrstream.str(""); +} + +void LTRTextHandler::handle(const char* buf, int length) +{ + // store in stream + m_ltrstream.write(buf,length); + //m_writer->handle(buf,length); +} + +void LTRTextHandler::endDocument() +{ +} + +void LTRTextHandler::startDocument(const Common::Misc::GenericDocumentProperties&) +{ +} + +void LTRTextHandler::startNode( const std::string& /*elementName*/, bool /*forIndexing*/ ) +{ +} + +void LTRTextHandler::endNode(const Common::Misc::GenericDocumentProperties& /*props*/) +{ +} + +} // end namespace +} // end namespace diff --git a/lima_linguisticprocessing/src/linguisticProcessing/client/AnalysisHandlers/LTRTextHandler.h b/lima_linguisticprocessing/src/linguisticProcessing/client/AnalysisHandlers/LTRTextHandler.h new file mode 100644 index 000000000..26cb3981b --- /dev/null +++ b/lima_linguisticprocessing/src/linguisticProcessing/client/AnalysisHandlers/LTRTextHandler.h @@ -0,0 +1,68 @@ +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. 
+ + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. If not, see +*/ +/*************************************************************************** + * Copyright (C) 2004-2012 by CEA LIST * + * * + ***************************************************************************/ +#ifndef LIMA_LINGUISTICPROCESSINGLTRTEXTHANDLER_H +#define LIMA_LINGUISTICPROCESSINGLTRTEXTHANDLER_H + +#include "AnalysisHandlersExport.h" + +#include "linguisticProcessing/client/AnalysisHandlers/AbstractTextualAnalysisHandler.h" +#include "linguisticProcessing/common/linearTextRepresentation/ltrText.h" +#include "common/Data/DataTypes.h" + +namespace Lima { + +namespace LinguisticProcessing { + +/** + * @brief LTRTextHandler is a handler for LTR text that gives access to the resulting LTRText through an accessor +*/ +class LIMA_ANALYSISHANDLERS_EXPORT LTRTextHandler : public AbstractTextualAnalysisHandler +{ +public: + LTRTextHandler(); + + virtual ~LTRTextHandler(); + + virtual void endAnalysis(); + virtual void handle(const char* buf, int length) ; + virtual void startAnalysis(); + + void startDocument(const Common::Misc::GenericDocumentProperties&); + void endDocument(); + void startNode( const std::string& elementName, bool forIndexing ); + void endNode(const Common::Misc::GenericDocumentProperties& props); + + Common::BagOfWords::LTR_Text& getLTRText(); + + virtual void setOut( std::ostream* /*out*/ ) {} + +private: + std::stringstream m_ltrstream; + 
Common::BagOfWords::LTR_Text m_ltrtext; +}; + +} + +} + +#endif diff --git a/lima_linguisticprocessing/src/linguisticProcessing/client/AnalysisHandlers/StructuredBoWToBoWDocument.cpp b/lima_linguisticprocessing/src/linguisticProcessing/client/AnalysisHandlers/StructuredBoWToBoWDocument.cpp index 9a216f4b6..964d8e760 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/client/AnalysisHandlers/StructuredBoWToBoWDocument.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/client/AnalysisHandlers/StructuredBoWToBoWDocument.cpp @@ -74,7 +74,7 @@ openSBoWIndexingNode(const Lima::Common::Misc::GenericDocumentProperties* proper void StructuredBoWToBoWDocument:: processSBoWText(const BoWText* boWText, - bool /*unused useIterators*/) + bool /*unused useIterators*/, bool useIndexIterator) { if (! m_inIndexingNode.empty() && m_inIndexingNode.back() && @@ -89,7 +89,7 @@ processSBoWText(const BoWText* boWText, void StructuredBoWToBoWDocument:: processProperties(const Common::Misc::GenericDocumentProperties* properties, - bool /*unused useIterators*/) + bool /*unused useIterators*/, bool /*useIndexIterator*/) { if (m_inIndexingNode.back()) { addProperties(*m_currentDocument,properties); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/client/AnalysisHandlers/StructuredBoWToBoWDocument.h b/lima_linguisticprocessing/src/linguisticProcessing/client/AnalysisHandlers/StructuredBoWToBoWDocument.h index 55a7bf601..32369c4d1 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/client/AnalysisHandlers/StructuredBoWToBoWDocument.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/client/AnalysisHandlers/StructuredBoWToBoWDocument.h @@ -55,10 +55,10 @@ class LIMA_ANALYSISHANDLERS_EXPORT StructuredBoWToBoWDocument : const std::string& elementName); void processSBoWText(const Common::BagOfWords::BoWText* boWText, - bool useIterators); + bool useIterators, bool useIndexIterator); void processProperties(const 
Common::Misc::GenericDocumentProperties* properties, - bool useIterators); + bool useIterators, bool useIndexIterator); void closeSBoWNode(); private: diff --git a/lima_linguisticprocessing/src/linguisticProcessing/client/LinguisticProcessingClientFactory.cpp b/lima_linguisticprocessing/src/linguisticProcessing/client/LinguisticProcessingClientFactory.cpp index 4ef8ecb3d..0f4e2d84b 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/client/LinguisticProcessingClientFactory.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/client/LinguisticProcessingClientFactory.cpp @@ -57,7 +57,7 @@ void LinguisticProcessingClientFactory::configureClientFactory( pipelines); } -AbstractProcessingClient* LinguisticProcessingClientFactory::createClient( +std::shared_ptr< AbstractProcessingClient > LinguisticProcessingClientFactory::createClient( const std::string& id) const { LPCLIENTFACTORYLOGINIT; @@ -79,7 +79,7 @@ std::deque LinguisticProcessingClientFactory::getRegisteredFactorie } -LinguisticProcessingClientFactoryFactory* LinguisticProcessingClientFactoryFactory::s_instance=new LinguisticProcessingClientFactoryFactory(); +std::unique_ptr< LinguisticProcessingClientFactoryFactory > LinguisticProcessingClientFactoryFactory::s_instance=std::unique_ptr< LinguisticProcessingClientFactoryFactory >(new LinguisticProcessingClientFactoryFactory()); } // LinguisticProcessing diff --git a/lima_linguisticprocessing/src/linguisticProcessing/client/LinguisticProcessingClientFactory.h b/lima_linguisticprocessing/src/linguisticProcessing/client/LinguisticProcessingClientFactory.h index 7b208d994..302b95410 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/client/LinguisticProcessingClientFactory.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/client/LinguisticProcessingClientFactory.h @@ -67,7 +67,7 @@ friend class Singleton; * ClientFactory must have been configured before this method is called * Use configureClientFactory() method to configure. 
*/ - AbstractProcessingClient* createClient(const std::string& id) const; + std::shared_ptr< AbstractProcessingClient > createClient(const std::string& id) const; /** * @brief show registered clientId @@ -86,14 +86,14 @@ class LIMA_LINGUISTICPROCESSIONGCLIENT_EXPORT LinguisticProcessingClientFactoryF public: ~LinguisticProcessingClientFactoryFactory(){}; - ProcessingClientFactory* createProcessingClientFactory() const + std::shared_ptr< ProcessingClientFactory > createProcessingClientFactory() const { - return new LinguisticProcessingClientFactory(); + return std::shared_ptr< ProcessingClientFactory >(new LinguisticProcessingClientFactory()); } private: LinguisticProcessingClientFactoryFactory():AbstractProcessingClientFactoryFactory("lpFactory"){}; - static LinguisticProcessingClientFactoryFactory* s_instance; + static std::unique_ptr< LinguisticProcessingClientFactoryFactory > s_instance; }; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/AbstractBoWDocumentHandler.h b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/AbstractBoWDocumentHandler.h index 7854ba75b..fe452acf1 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/AbstractBoWDocumentHandler.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/AbstractBoWDocumentHandler.h @@ -59,8 +59,10 @@ class LIMA_BOW_EXPORT AbstractBoWDocumentHandler : public AbstractDocumentHandle const std::string& elementName) = 0; virtual void openSBoWIndexingNode(const Lima::Common::Misc::GenericDocumentProperties* properties, const std::string& elementName) = 0; - virtual void processSBoWText(const BoWText* boWText, bool useIterators) = 0; -// virtual void processProperties(const Misc::GenericDocumentProperties* properties, bool useIterators) = 0; + virtual void processSBoWText(const BoWText* boWText, bool useIterators, + bool useIndexIterator) = 0; + virtual void processProperties(const Misc::GenericDocumentProperties* 
properties, bool useIterators, + bool useIndexIterator) = 0; virtual void closeSBoWNode() = 0; // virtual void writeDocumentsHeader() = 0; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/AbstractBoWElement.h b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/AbstractBoWElement.h index e24388193..462d4adcf 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/AbstractBoWElement.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/AbstractBoWElement.h @@ -50,7 +50,7 @@ namespace BagOfWords { #ifndef WIN32 enum class BoWType : unsigned short { #else -public enum class BoWType : unsigned short { +enum BoWType { #endif BOW_NOTYPE, /**< the AbstractBoWElement is an abstract one that should not be instanciated */ diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/BinaryWriterBoWDocumentHandler.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/BinaryWriterBoWDocumentHandler.cpp index 3a8c511ad..412a107b1 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/BinaryWriterBoWDocumentHandler.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/BinaryWriterBoWDocumentHandler.cpp @@ -65,7 +65,7 @@ openSBoWIndexingNode(const Misc::GenericDocumentProperties* /*properties*/, void BinaryWriterBoWDocumentHandler:: processSBoWText(const BoWText* boWText, - bool /*useIterators*/) + bool /*useIterators*/, bool /*useIndexIterator*/) { Common::Misc::writeOneByteInt(m_outputStream,Common::BagOfWords::BOW_TEXT_BLOC); m_writer.writeBoWText(m_outputStream,*boWText); @@ -73,7 +73,7 @@ processSBoWText(const BoWText* boWText, void BinaryWriterBoWDocumentHandler:: processProperties(const Misc::GenericDocumentProperties* properties, - bool /*useIterators*/) + bool /*useIterators*/, bool /*useIndexIterator*/) { 
Common::Misc::writeOneByteInt(m_outputStream,Common::BagOfWords::DOCUMENT_PROPERTIES_BLOC); properties->write(m_outputStream); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/BinaryWriterBoWDocumentHandler.h b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/BinaryWriterBoWDocumentHandler.h index 9263c2abe..f8dc906ed 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/BinaryWriterBoWDocumentHandler.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/BinaryWriterBoWDocumentHandler.h @@ -54,9 +54,9 @@ class LIMA_BOW_EXPORT BinaryWriterBoWDocumentHandler : public AbstractBoWDocumen void openSBoWIndexingNode(const Misc::GenericDocumentProperties* properties, const std::string& elementName); void processSBoWText(const BoWText* boWText, - bool useIterators); + bool useIterators, bool useIndexIterator); void processProperties(const Misc::GenericDocumentProperties* properties, - bool useIterators); + bool useIterators, bool useIndexIterator); void closeSBoWNode(); private: diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/TextWriterBoWDocumentHandler.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/TextWriterBoWDocumentHandler.cpp index f45f705f3..01075d7ed 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/TextWriterBoWDocumentHandler.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/TextWriterBoWDocumentHandler.cpp @@ -29,6 +29,8 @@ #include "bowTokenIterator.h" #include "bowToken.h" #include "bowText.h" +#include "indexElementIterator.h" +#include "indexElement.h" #include "common/Data/genericDocumentProperties.h" @@ -42,6 +44,8 @@ class TextWriterBoWDocumentHandlerPrivate TextWriterBoWDocumentHandlerPrivate(std::ostream& os); ~TextWriterBoWDocumentHandlerPrivate(); + + void writeIndexElement(const IndexElement& element); std::ostream& 
m_outputStream; @@ -84,7 +88,7 @@ openSBoWIndexingNode(const Misc::GenericDocumentProperties* properties, void TextWriterBoWDocumentHandler:: processSBoWText(const BoWText* boWText, - bool useIterators) + bool useIterators, bool useIndexIterator) { if (useIterators) { BoWTokenIterator it(*boWText); @@ -93,6 +97,14 @@ processSBoWText(const BoWText* boWText, it++; } } + else if (useIndexIterator) { + IndexElementIterator it(*boWText); + while (! it.isAtEnd()) + { + m_d->writeIndexElement(it.getElement()); + it++; + } + } else { m_d->m_outputStream << *boWText; } @@ -100,7 +112,7 @@ processSBoWText(const BoWText* boWText, void TextWriterBoWDocumentHandler:: processProperties(const Misc::GenericDocumentProperties* /*properties*/, - bool /*useIterators*/) + bool /*useIterators*/, bool /*useIndexIterator*/) { //os << *properties; } @@ -110,6 +122,54 @@ closeSBoWNode() { } +void TextWriterBoWDocumentHandlerPrivate::writeIndexElement( + const IndexElement& element) { +// m_outputStream << "" << endl; +// return; +// } +// if (element.isSimpleTerm()) { +// std::string cat = static_cast(Common::MediaticData::MediaticData::single().mediaData(m_language)).getPropertyCodeManager().getPropertyManager("MACRO").getPropertySymbolicValue(static_cast(element.getCategory())); +// +// m_outputStream << " lemma=\"" << xmlString(Common::Misc::limastring2utf8stdstring(element.getSimpleTerm())) +// << "\" category=\"" << cat +// << "\" position=\"" << element.getPosition() +// << "\" length=\"" << element.getLength() << "\""; +// if (element.isNamedEntity()) { +// m_outputStream << " neType=\"" << element.getNamedEntityType() << "\""; +// m_outputStream << " type=\"" << BOW_NAMEDENTITY << "\""; +// } +// else { +// m_outputStream << " type=\"" << BOW_TOKEN << "\""; +// } +// m_outputStream << "/>" << endl; +// return; +// } +// +// // compound +// if (element.isNamedEntity()) { +// m_outputStream << " neType=\"" << element.getNamedEntityType() << "\""; +// m_outputStream << " type=\"" << 
BOW_NAMEDENTITY << "\""; +// } +// else { +// m_outputStream << " type=\"" << BOW_TERM << "\""; +// } +// m_outputStream << ">" << endl +// << " " << endl; +// +// for (uint64_t i(0),size=element.getStructure().size(); i" << endl; +// +// } +// m_outputStream << " " << endl +// << "" << endl; +} + } // end namespace } // end namespace diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/TextWriterBoWDocumentHandler.h b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/TextWriterBoWDocumentHandler.h index fd6ba24a3..93edf3519 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/TextWriterBoWDocumentHandler.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/TextWriterBoWDocumentHandler.h @@ -57,9 +57,9 @@ class LIMA_BOW_EXPORT TextWriterBoWDocumentHandler : public AbstractBoWDocumentH void openSBoWIndexingNode(const Misc::GenericDocumentProperties* properties, const std::string& elementName); void processSBoWText(const BoWText* boWText, - bool useIterators); + bool useIterators, bool useIndexIterator); void processProperties(const Misc::GenericDocumentProperties* properties, - bool useIterators); + bool useIterators, bool useIndexIterator); void closeSBoWNode(); private: diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowBinaryReaderWriter.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowBinaryReaderWriter.cpp index 57acc5470..ee09af42b 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowBinaryReaderWriter.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowBinaryReaderWriter.cpp @@ -137,14 +137,14 @@ void BoWBinaryReader::readHeader(std::istream& file) #ifdef DEBUG_LP BOWLOGINIT; - LDEBUG << "BoWBinaryReader::readHeader type mapping is"; + LDEBUG << "BoWBinaryReader::readHeader type mapping is (shown if logger = TRACE)"; std::ostringstream 
oss; for (std::map::const_iterator it=m_d->m_entityTypeMapping.begin(),it_end=m_d->m_entityTypeMapping.end(); it!=it_end; it++) { oss << (*it).first << " -> " << (*it).second << std::endl; } - LDEBUG << oss.str(); + LTRACE << oss.str(); LDEBUG << "BoWBinaryReader::readHeader end file at: " << file.tellg(); #endif @@ -193,9 +193,14 @@ void BoWBinaryReader::readBoWText(std::istream& file, void BoWBinaryReader::readBoWDocumentBlock(std::istream& file, BoWDocument& document, AbstractBoWDocumentHandler& handler, - bool useIterator) + bool useIterator, + bool useIndexIterator) { BoWBlocType blocType = static_cast( Misc::readOneByteInt(file) ); +#ifdef ANTINNO_BUGFIX + if (file.eof()) + return; +#endif #ifdef DEBUG_LP BOWLOGINIT; LDEBUG << "BoWBinaryReader::readBoWDocumentBlock: read blocType" << blocType; @@ -230,7 +235,7 @@ void BoWBinaryReader::readBoWDocumentBlock(std::istream& file, #endif document.clear(); readBoWText(file,document); - handler.processSBoWText(&document, useIterator); + handler.processSBoWText(&document, useIterator, useIndexIterator); break; } case NODE_PROPERTIES_BLOC: @@ -239,7 +244,7 @@ void BoWBinaryReader::readBoWDocumentBlock(std::istream& file, LDEBUG << "NODE_PROPERTIES_BLOC"; #endif document.Misc::GenericDocumentProperties::read(file); - handler.processProperties(&document, useIterator); + handler.processProperties(&document, useIterator, useIndexIterator); break; } case END_BLOC: @@ -264,6 +269,11 @@ void BoWBinaryReader::readBoWDocumentBlock(std::istream& file, break; } default:; +#ifdef ANTINNO_SPECIFIC +#ifdef DEBUG_LP + LERROR << "MultimediaBinaryReaderIndexer::readMultimediaDocumentBlock: unmanaged block type " << blocType; +#endif +#endif } } @@ -341,6 +351,7 @@ void BoWBinaryReaderPrivate::readSimpleToken(std::istream& file, #ifdef DEBUG_LP LDEBUG << "BoWBinaryReader::readSimpleToken read infl: " << inflectedForm; #endif +#ifdef ANTINNO_SPECIFIC if (lemma.isEmpty()) { #ifdef DEBUG_LP @@ -348,6 +359,7 @@ void 
BoWBinaryReaderPrivate::readSimpleToken(std::istream& file, #endif lemma = inflectedForm; } +#endif LinguisticCode category; uint64_t position,length; category=static_cast(Misc::readCodedInt(file)); @@ -555,7 +567,7 @@ void BoWBinaryWriter::writeBoWDocument(std::ostream& file, { BOWLOGINIT; LERROR << "BoWBinaryWriter: writeBoWDocument non implemented"; - LERROR << "Can not write "<< doc << " into "<< file; + LERROR << "Can not write "<< doc << " into stream"<< &file; } void BoWBinaryWriter::writeBoWToken(std::ostream& file, @@ -611,30 +623,21 @@ void BoWBinaryWriterPrivate::writeSimpleToken(std::ostream& file, { #ifdef DEBUG_LP BOWLOGINIT; - LDEBUG << "BoWBinaryWriter::writeSimpleToken write lemma:" << &file << token->getLemma(); -#endif - if (!token->getLemma().isEmpty()) - { -#ifdef DEBUG_LP - LDEBUG << "BoWBinaryWriter::writeSimpleToken non-empty lemma"; -#endif - Misc::writeUTF8StringField(file,token->getLemma()); - } - else - { -#ifdef DEBUG_LP - LDEBUG << "BoWBinaryWriter::writeSimpleToken empty lemma, writing inflected form instead:" << token->getInflectedForm(); + LDEBUG << "BoWBinaryWriter::writeSimpleToken write lemma: " << &file << token->getLemma(); #endif - Misc::writeUTF8StringField(file,token->getInflectedForm()); - } + Misc::writeUTF8StringField(file,token->getLemma()); #ifdef DEBUG_LP LDEBUG << "BoWBinaryWriter::writeSimpleToken write infl: " << token->getInflectedForm(); #endif Misc::writeUTF8StringField(file,token->getInflectedForm()); Misc::writeCodedInt(file,token->getCategory()); +#ifdef ANTINNO_SPECIFIC + + // FWI 04/08/2016 : correction de length qui ne tient pas compte des entites xml dans le lemme auto beg = token->getPosition(); auto end = token->getLength() + beg; + //::std::cout << "beg: " << beg << " end: " << end << ::std::endl; if (m_shiftFrom.empty()) { @@ -648,8 +651,8 @@ void BoWBinaryWriterPrivate::writeSimpleToken(std::ostream& file, LDEBUG << "BoWBinaryWriter::writeSimpleToken shiftFrom from begin" << beg; LDEBUG << 
"BoWBinaryWriter::writeSimpleToken shiftFrom from end" << end; #endif - auto const shiftForBeginIt = m_shiftFrom.lowerBound(beg-1); - if (shiftForBeginIt == m_shiftFrom.constBegin()) + auto const it1 = m_shiftFrom.lowerBound(beg-1); + if (it1 == m_shiftFrom.constBegin()) { #ifdef DEBUG_LP LDEBUG << "BoWBinaryWriter::writeSimpleToken shiftFrom from begin: NO shift"; @@ -658,12 +661,12 @@ void BoWBinaryWriterPrivate::writeSimpleToken(std::ostream& file, else { #ifdef DEBUG_LP - LDEBUG << "BoWBinaryWriter::writeSimpleToken shiftFrom from begin: shift by" << (shiftForBeginIt-1).value(); + LDEBUG << "BoWBinaryWriter::writeSimpleToken shiftFrom from begin: shift by" << (it1-1).value(); #endif - beg += (shiftForBeginIt-1).value(); + beg += (it1-1).value(); } - auto const shiftForEndIt = m_shiftFrom.lowerBound(end-1); - if (shiftForEndIt == m_shiftFrom.constBegin()) + auto const it2 = m_shiftFrom.lowerBound(end-1); + if (it2 == m_shiftFrom.constBegin()) { #ifdef DEBUG_LP LDEBUG << "BoWBinaryWriter::writeSimpleToken shiftFrom from end: NO shift"; @@ -672,14 +675,46 @@ void BoWBinaryWriterPrivate::writeSimpleToken(std::ostream& file, else { #ifdef DEBUG_LP - LDEBUG << "BoWBinaryWriter::writeSimpleToken shiftFrom from end: shift by" << (shiftForEndIt-1).value(); + LDEBUG << "BoWBinaryWriter::writeSimpleToken shiftFrom from end: shift by" << (it2-1).value(); #endif - end += (shiftForEndIt-1).value(); + end += (it2-1).value(); } } Misc::writeCodedInt(file, beg-1); Misc::writeCodedInt(file, end-beg); + +#else + if (m_shiftFrom.empty()) + { +#ifdef DEBUG_LP + LDEBUG << "BoWBinaryWriter::writeSimpleToken shiftFrom is empty"; +#endif + Misc::writeCodedInt(file,token->getPosition()-1); + } + else + { +#ifdef DEBUG_LP + LDEBUG << "BoWBinaryWriter::writeSimpleToken shiftFrom from" << token->getPosition(); +#endif + QMap::const_iterator it = m_shiftFrom.lowerBound(token->getPosition()-1); + if (it == m_shiftFrom.constBegin()) + { +#ifdef DEBUG_LP + LDEBUG << 
"BoWBinaryWriter::writeSimpleToken shiftFrom NO shift"; +#endif + Misc::writeCodedInt(file,token->getPosition()-1); + } + else + { +#ifdef DEBUG_LP + LDEBUG << "BoWBinaryWriter::writeSimpleToken shiftFrom shift by" << (it-1).value(); +#endif + Misc::writeCodedInt(file,token->getPosition()+ (it-1).value()-1); + } + } + Misc::writeCodedInt(file,token->getLength()); +#endif } void BoWBinaryWriter::writePredicate(std::ostream& file, diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowBinaryReaderWriter.h b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowBinaryReaderWriter.h index 19408baf6..5893d1947 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowBinaryReaderWriter.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowBinaryReaderWriter.h @@ -79,7 +79,8 @@ class LIMA_BOW_EXPORT BoWBinaryReader void readBoWDocumentBlock(std::istream& file, BoWDocument& document, AbstractBoWDocumentHandler& handler, - bool useIterator=false); + bool useIterator, + bool useIndexIterator); boost::shared_ptr< Lima::Common::BagOfWords::AbstractBoWElement > readBoWToken(std::istream& file); void readSimpleToken(std::istream& file, boost::shared_ptr< BoWToken > token); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowComplexToken.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowComplexToken.cpp index 24a3a665a..a577e852d 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowComplexToken.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowComplexToken.cpp @@ -237,8 +237,25 @@ boost::shared_ptr< BoWToken > BoWComplexTokenPrivate::addPart(boost::shared_ptr< if (isHead) { m_head=m_parts.size()-1; } - if (tok->getPosition() < m_position) m_position = tok->getPosition(); - if (tok->getPosition() > (m_position + m_length)) m_length = 
(tok->getPosition()+tok->getLength()-m_position-1); + uint64_t previousPosition = m_position; + + // added the first part + if (m_position == 0 && m_length==0) + { + m_position = tok->getPosition(); + m_length = tok->getLength(); + } + // adding a part before the previous first part + else if (tok->getPosition() < m_position) + { + m_position = tok->getPosition(); + m_length = previousPosition - tok->getPosition() + m_length; + } + // adding a part after the current end + else if (tok->getPosition() > (previousPosition + m_length)) + { + m_length = tok->getPosition() - previousPosition + tok->getLength(); + } return tok; } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowDocument.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowDocument.cpp index 7eaf085a7..e9211f43f 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowDocument.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowDocument.cpp @@ -93,10 +93,6 @@ BoWDocument::BoWDocument(const BoWDocument& d): //*********************************************************************** BoWDocument::~BoWDocument() { -#ifdef DEBUG_LP - BOWLOGINIT; - LDEBUG << "BoWDocument::~BoWDocument" << this; -#endif clear(); } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowNamedEntity.h b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowNamedEntity.h index 6c0304384..44d82ee6a 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowNamedEntity.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowNamedEntity.h @@ -84,7 +84,7 @@ class LIMA_BOW_EXPORT BoWNamedEntity : public BoWComplexToken */ std::string getFeaturesUTF8String(void) const; /** get a string of the BoWToken for output function */ - virtual std::string getOutputUTF8String(const Common::PropertyCode::PropertyManager* macroManager) 
const; + virtual std::string getOutputUTF8String(const Common::PropertyCode::PropertyManager* macroManager = 0) const; virtual std::string getIdUTF8String(void) const; /** diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowText.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowText.cpp index e4ca102c9..3486b2049 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowText.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowText.cpp @@ -80,8 +80,6 @@ BoWText& BoWText::operator = (const BoWText& t) BoWText::~BoWText() { - BOWLOGINIT; - LDEBUG << "BoWText::~BoWText()" << this; clear(); } void BoWText::writeBoWText(ostream& stream) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowToken.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowToken.cpp index dc031c41f..547ad60f9 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowToken.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowToken.cpp @@ -273,8 +273,6 @@ BoWToken* BoWToken::clone() const //*********************************************************************** BoWToken::~BoWToken() { - BOWLOGINIT; - LDEBUG << "BoWToken::~BoWToken " << this; delete m_d; } @@ -339,10 +337,10 @@ LimaString BoWToken::getString(void) const if (m_d->m_useOnlyLemma) { -//#ifdef DEBUG_LP - LDEBUG << "BoWToken::getString: m_d->m_useOnlyLemma is 'true'"; - LDEBUG << "BoWToken::getString: getLemma()=" << getLemma(); -//#endif +// #ifdef DEBUG_LP +// LDEBUG << "BoWToken::getString: m_d->m_useOnlyLemma is 'true'"; +// LDEBUG << "BoWToken::getString: getLemma()=" << getLemma(); +// #endif return getLemma(); } else @@ -350,13 +348,13 @@ LimaString BoWToken::getString(void) const ostringstream cat; cat << m_d->m_category; //#ifdef DEBUG_LP - LDEBUG << "BoWToken::getString: m_d->m_useOnlyLemma is 
'false'"; +// LDEBUG << "BoWToken::getString: m_d->m_useOnlyLemma is 'false'"; //#endif /* ostringstream len; len << m_length; return m_lemma + m_separator + LimaString(cat.str()) + m_separator + LimaString(len.str());*/ //#ifdef DEBUG_LP - LDEBUG << "BoWToken::getString: getLemma()=" << getLemma() << ", cat=" << Misc::utf8stdstring2limastring(cat.str() ); +// LDEBUG << "BoWToken::getString: getLemma()=" << getLemma() << ", cat=" << Misc::utf8stdstring2limastring(cat.str() ); //#endif return getLemma() + m_d->m_separator + Misc::utf8stdstring2limastring(cat.str()); } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowToken.h b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowToken.h index 45e92a7e9..574c8d4e6 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowToken.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowToken.h @@ -158,9 +158,17 @@ class LIMA_BOW_EXPORT BoWToken : public AbstractBoWElement */ virtual void addToPosition(const uint64_t offset); +#ifdef ANTINNO_SPECIFIC + friend LIMA_BOW_EXPORT ::std::ostream& ::Lima::Common::Misc::operator << (::std::ostream& os, +#else friend LIMA_BOW_EXPORT std::ostream& operator << (std::ostream& os, +#endif const Common::Misc::PositionLengthList& p); +#ifdef ANTINNO_SPECIFIC + friend LIMA_BOW_EXPORT QDebug& ::Lima::Common::Misc::operator << (QDebug& os, +#else friend LIMA_BOW_EXPORT QDebug& operator << (QDebug& os, +#endif const Common::Misc::PositionLengthList& p); static void setUseOnlyLemma(const bool b); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowXMLWriter.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowXMLWriter.cpp index bcc5ee0c8..a6ab3d1a0 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowXMLWriter.cpp +++ 
b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowXMLWriter.cpp @@ -57,6 +57,11 @@ class BoWXMLWriterPrivate friend class BoWXMLWriter; BoWXMLWriterPrivate(std::ostream& os); +// FWI 08/09/2015 : ajout de la langue en paramtre +#ifdef ANTINNO_SPECIFIC + BoWXMLWriterPrivate(std::ostream& os, Lima::MediaId const& language); +#else +#endif virtual ~BoWXMLWriterPrivate(); @@ -75,8 +80,8 @@ friend class BoWXMLWriter; void writeBoWRelation(const BoWRelation* relation); void writeComplexTokenParts(const BoWComplexToken* token); void writeBoWTokenList(const BoWText* text, - const bool useIterator=false, - const bool useIndexIterator=false); + const bool useIterator, + const bool useIndexIterator); void writeGenericDocumentProperties(const Misc::GenericDocumentProperties* prop); void writePredicateRoles(const BoWPredicate* term); template @@ -107,6 +112,18 @@ m_language(0) { } +// FWI 08/09/2015 : ajout de la langue en paramtre +#ifdef ANTINNO_SPECIFIC +BoWXMLWriterPrivate::BoWXMLWriterPrivate(std::ostream& os, Lima::MediaId const& language): +m_outputStream(os), +m_currentTokId(0), +m_spaces(""), +m_language(language) +{ +} +#else +#endif + BoWXMLWriterPrivate::~BoWXMLWriterPrivate() { } @@ -119,6 +136,15 @@ BoWXMLWriter::BoWXMLWriter(std::ostream& os): { } +// FWI 08/09/2015 : ajout de la langue en paramtre +#ifdef ANTINNO_SPECIFIC +BoWXMLWriter::BoWXMLWriter(std::ostream& os, Lima::MediaId const& language): +m_d(new BoWXMLWriterPrivate(os, language)) +{ +} +#else +#endif + BoWXMLWriter::~BoWXMLWriter() { delete m_d; @@ -159,14 +185,16 @@ void BoWXMLWriter::closeSBoWNode() { m_d->decIndent(); } -void BoWXMLWriter::processSBoWText( const BoWText* boWText, bool useIterator) { +void BoWXMLWriter::processSBoWText( const BoWText* boWText, bool useIterator, + bool useIndexIterator) { m_d->m_language = Common::MediaticData::MediaticData::single().getMediaId ( boWText->lang ); - m_d->writeBoWTokenList(boWText,useIterator); + 
m_d->writeBoWTokenList(boWText,useIterator,useIndexIterator); } void BoWXMLWriter::processProperties( - const Misc::GenericDocumentProperties* properties, bool /*unused useIterators*/) { + const Misc::GenericDocumentProperties* properties, bool /*unused useIterators*/, + bool /*useIndexIterator*/) { m_d->writeGenericDocumentProperties(properties); } @@ -350,31 +378,78 @@ void BoWXMLWriterPrivate::writeIndexElement( m_outputStream << "/>" << endl; return; } + if (element.isSimpleTerm()) { std::string cat = static_cast(Common::MediaticData::MediaticData::single().mediaData(m_language)).getPropertyCodeManager().getPropertyManager("MACRO").getPropertySymbolicValue(static_cast(element.getCategory())); m_outputStream << " lemma=\"" << xmlString(Common::Misc::limastring2utf8stdstring(element.getSimpleTerm())) +#ifdef ANTINNO_SPECIFIC +// FWI 09/09/2015 hack pour garder la compatibilit avec la box + << "\" category=\"" << element.getCategory() + << "\" categoryString=\"" << cat // uniquement pour info +#else << "\" category=\"" << cat +#endif << "\" position=\"" << element.getPosition() << "\" length=\"" << element.getLength() << "\""; if (element.isNamedEntity()) { +#ifdef ANTINNO_SPECIFIC + string const neTypeAsString = Common::Misc::limastring2utf8stdstring(MediaticData::MediaticData::single().getEntityName(element.getNamedEntityType())); + m_outputStream << " neType=\"" << element.getNamedEntityType()/*xmlString(neTypeAsString)*/ << "\""; + m_outputStream << " neTypeString=\"" << xmlString(neTypeAsString) << "\""; + m_outputStream << " type=\"" << static_cast(BoWType::BOW_NAMEDENTITY) << "\""; + m_outputStream << " typeString=\"" << BoWType::BOW_NAMEDENTITY << "\""; +#else m_outputStream << " neType=\"" << element.getNamedEntityType() << "\""; m_outputStream << " type=\"" << BoWType::BOW_NAMEDENTITY << "\""; +#endif } else { +#ifdef ANTINNO_SPECIFIC + m_outputStream << " type=\"" << static_cast(BoWType::BOW_TOKEN) << "\""; + m_outputStream << " typeString=\"" << 
BoWType::BOW_TOKEN << "\""; +#else m_outputStream << " type=\"" << BoWType::BOW_TOKEN << "\""; +#endif } m_outputStream << "/>" << endl; return; } // compound + +#ifdef ANTINNO_SPECIFIC + std::string const cat = static_cast(Common::MediaticData::MediaticData::single().mediaData(m_language)).getPropertyCodeManager().getPropertyManager("MACRO").getPropertySymbolicValue(static_cast(element.getCategory())); + // FWI 15/09/2015 : ajout pour info de la chane mme pour les mots composs + // + hack pour garder la compatibilit avec la box + m_outputStream + << " lemma=\"" << xmlString(Common::Misc::limastring2utf8stdstring(element.getSimpleTerm())) << "\"" + << " category=\"" << element.getCategory() << "\"" + << " categoryString=\"" << xmlString(cat) << "\"" /* uniquement pour info */ + << " position=\"" << element.getPosition() << "\"" + << " length=\"" << element.getLength() << "\""; +#endif + if (element.isNamedEntity()) { + +#ifdef ANTINNO_SPECIFIC + string const neTypeAsString = Common::Misc::limastring2utf8stdstring(MediaticData::MediaticData::single().getEntityName(element.getNamedEntityType())); + m_outputStream << " neType=\"" << element.getNamedEntityType() /*xmlString(neTypeAsString)*/ << "\""; + m_outputStream << " neType=\"" << xmlString(neTypeAsString) << "\""; + m_outputStream << " type=\"" << static_cast(BoWType::BOW_NAMEDENTITY) << "\""; + m_outputStream << " typeString=\"" << BoWType::BOW_NAMEDENTITY << "\""; +#else m_outputStream << " neType=\"" << element.getNamedEntityType() << "\""; m_outputStream << " type=\"" << BoWType::BOW_NAMEDENTITY << "\""; +#endif } else { +#ifdef ANTINNO_SPECIFIC + m_outputStream << " type=\"" << static_cast(BoWType::BOW_TERM) << "\""; + m_outputStream << " typeString=\"" << BoWType::BOW_TERM << "\""; +#else m_outputStream << " type=\"" << BoWType::BOW_TERM << "\""; +#endif } m_outputStream << ">" << endl << m_spaces << " " << endl; @@ -410,7 +485,13 @@ void BoWXMLWriterPrivate::writeBoWToken( m_outputStream <getLemma())) 
+#ifdef ANTINNO_SPECIFIC +// FWI 09/09/2015 hack pour garder la compatibilit avec la box + << "\" category=\"" << tok->getCategory() + << "\" categoryString=\"" << xmlString(cat) // uniquement pour info +#else << "\" category=\"" << cat +#endif <<"\" position=\"" << tok->getPosition() << "\" length=\"" << tok->getLength() << "\"" << "/>" << std::endl; @@ -437,7 +518,13 @@ void BoWXMLWriterPrivate::writeBoWToken( m_outputStream <getLemma())) +#ifdef ANTINNO_SPECIFIC +// FWI 09/09/2015 hack pour garder la compatibilit avec la box + << "\" category=\"" << term->getCategory() + << "\" categoryString=\"" << xmlString(cat) // uniquement pour info +#else << "\" category=\"" << cat +#endif <<"\" position=\"" << term->getPosition() << "\" length=\"" << term->getLength() << "\"" << ">" << std::endl; @@ -454,7 +541,13 @@ void BoWXMLWriterPrivate::writeBoWToken( m_outputStream <getLemma())) +#ifdef ANTINNO_SPECIFIC +// FWI 09/09/2015 hack pour garder la compatibilit avec la box + << "\" category=\"" << ne->getCategory() + << "\" categoryString=\"" << xmlString(cat) // uniquement pour info +#else << "\" category=\"" << cat +#endif <<"\" position=\"" << ne->getPosition() << "\" length=\"" << ne->getLength() << "\" type=\"" diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowXMLWriter.h b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowXMLWriter.h index b56a8ceb0..a6936be25 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowXMLWriter.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowXMLWriter.h @@ -53,16 +53,21 @@ class IndexElement; class LIMA_BOW_EXPORT BoWXMLWriter : public AbstractBoWDocumentHandler { public: - BoWXMLWriter(std::ostream& os); - virtual ~BoWXMLWriter(); +// FWI 08/09/2015 : ajout de la langue en paramtre optionnel +#ifdef ANTINNO_SPECIFIC + BoWXMLWriter(std::ostream& os, Lima::MediaId const& language); +#else +#endif + 
BoWXMLWriter(std::ostream& os); + virtual ~BoWXMLWriter(); void writeBoWText(const BoWText* document, - const bool useIterator=false, - const bool useIndexIterator=false); + const bool useIterator, + const bool useIndexIterator); void writeBoWToken(const BoWToken* token); void writeBoWDocument(const BoWDocument* document, - const bool useIterator=false, - const bool useIndexIterator=false); + const bool useIterator, + const bool useIndexIterator); // root tags for valid XML if several documents void writeBoWDocumentsHeader(); @@ -73,8 +78,10 @@ class LIMA_BOW_EXPORT BoWXMLWriter : public AbstractBoWDocumentHandler // Implementation of AbstractBoWXMLWriter functions void openSBoWNode(const Lima::Common::Misc::GenericDocumentProperties* properties, const std::string& elementName); void openSBoWIndexingNode(const Lima::Common::Misc::GenericDocumentProperties* properties, const std::string& elementName); - void processSBoWText(const BoWText* boWText, bool useIterators); - void processProperties(const Misc::GenericDocumentProperties* properties, bool useIterators); + void processSBoWText(const BoWText* boWText, bool useIterators, + bool useIndexIterator); + void processProperties(const Misc::GenericDocumentProperties* properties, bool useIterators, + bool useIndexIterator); void closeSBoWNode(); void writeIndexElement(const IndexElement& element); void setSpaces(const std::string& s); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/indexElement.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/indexElement.cpp index abec11f2a..4249f7b87 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/indexElement.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/indexElement.cpp @@ -238,9 +238,9 @@ uint64_t IndexElement::getId() const { return m_d->m_id; } Lima::Common::BagOfWords::BoWType IndexElement::getType() const { return m_d->m_type; } -bool 
IndexElement::isSimpleTerm() const { return m_d->m_type == BoWType::BOW_TOKEN; } +bool IndexElement::isSimpleTerm() const { return m_d->m_type == BoWType::BOW_TOKEN || (m_d->m_type == BoWType::BOW_NAMEDENTITY && m_d->m_structure.empty()); } -bool IndexElement::isComposedTerm() const { return m_d->m_type == BoWType::BOW_TERM; } +bool IndexElement::isComposedTerm() const { return m_d->m_type == BoWType::BOW_TERM || (m_d->m_type == BoWType::BOW_NAMEDENTITY && ! m_d->m_structure.empty()); } bool IndexElement::isPredicate() const { return m_d->m_type == BoWType::BOW_PREDICATE; } @@ -348,9 +348,14 @@ std::ostream& operator<<(std::ostream& os, const IndexElement& elt) os << "[IndexElement" << elt.m_d->m_id << "," << elt.m_d->m_type ; if (elt.isSimpleTerm()) { os << ":" << Common::Misc::limastring2utf8stdstring(elt.m_d->m_word); +#ifdef ANTINNO_SPECIFIC + // affichage systmatique + os << "/" << elt.m_d->m_category; +#else if (elt.m_d->m_category != 0) { os << "/" << elt.m_d->m_category; } +#endif os << "/" << elt.m_d->m_position; os << "," << elt.m_d->m_length; } @@ -366,9 +371,12 @@ std::ostream& operator<<(std::ostream& os, const IndexElement& elt) os << "," << elt.m_d->m_structure[i] << " RE(" << elt.m_d->m_relations[i] << ")"; i++; } + os << "]"; } - os << "/"; - ::operator<<(os,elt.m_d->m_poslenlist); + // FWI 20/02/2015 + //os << "/"; + //::operator<<(os,elt.m_d->m_poslenlist); + os << "/" << elt.m_d->m_poslenlist; } if (! elt.m_d->m_neType.isNull()) { os << "/NE(" << Lima::Common::MediaticData::MediaticData::single().getEntityName(elt.m_d->m_neType).toUtf8().constData() << ")"; @@ -434,8 +442,10 @@ QTextStream& operator<<(QTextStream& os, const IndexElement& elt) { i++; } } - os << "/"; - ::operator<<(os,elt.m_d->m_poslenlist); + // FWI 20/02/2015 + //os << "/"; + //::operator<<(os,elt.m_d->m_poslenlist); + os << "/" << elt.m_d->m_poslenlist; } if (! 
elt.m_d->m_neType.isNull()) { os << "/NE(" << Lima::Common::MediaticData::MediaticData::single().getEntityName(elt.m_d->m_neType) << ")"; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/indexElementIterator.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/indexElementIterator.cpp index c1100429d..f565c5480 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/indexElementIterator.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/indexElementIterator.cpp @@ -551,8 +551,15 @@ bool IndexElementIteratorPrivate::addCombinedPartsInQueue( BOWLOGINIT; #endif QStringList structureKey; - for (auto element: structure) { +#ifdef ANTINNO_SPECIFIC + // Modif NAN pour que a compile sous Visual 2010 + for (auto itElement=structure.begin(),it_end=structure.end(); itElement!=it_end; itElement++) { + structureKey << QString::number(*itElement); +#else + for (auto element: structure) { structureKey << QString::number(element); +#endif + } #ifdef DEBUG_CD LDEBUG << "addCombinedPartsInQueue: nb parts=" << partIdsRels.size() @@ -610,7 +617,11 @@ bool IndexElementIteratorPrivate::addCombinedPartsInQueue( } // add possible at end of structure and recursive call +#ifdef ANTINNO_SPECIFIC + Q_FOREACH (auto it,partIdsRels[current].first) { +#else for (auto it:partIdsRels[current].first) { +#endif structure.push_back(it); relations.push_back(partIdsRels[current].second); if (!addCombinedPartsInQueue(type, partIdsRels,head,neType,ids_rel,structure,relations,current+1)) { diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/PropertyCode/PropertyCodeManager.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/PropertyCode/PropertyCodeManager.cpp index a3c3dcf46..5fc81099c 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/PropertyCode/PropertyCodeManager.cpp +++ 
b/lima_linguisticprocessing/src/linguisticProcessing/common/PropertyCode/PropertyCodeManager.cpp @@ -24,6 +24,8 @@ #include "PropertyCodeManager.h" #include "XMLPropertyHandler.h" #include "SymbolicCodeXMLHandler.h" +// FWI 25/02/2015 inclusion pour bnficier de operator<<(osstream&, QString) +#include "common/Data/LimaString.h" #include @@ -55,16 +57,9 @@ PropertyCodeManager::PropertyCodeManager() void PropertyCodeManager::readFromXmlFile(const std::string& filename) { PROPERTYCODELOGINIT; - - // check that file exists - { - ifstream fin(filename.c_str(), std::ifstream::binary); - if (!fin.good()) { - LERROR << "invalid XMLPropertyCode file " << filename; - throw InvalidConfiguration(); - } - fin.close(); - } +#ifdef DEBUG_LP + LDEBUG << typeid(*this).name() << "PropertyCodeManager::readFromXmlFile" << filename; +#endif #ifdef DEBUG_LP LDEBUG << typeid(*this).name() << "PropertyCodeManager::readFromXmlFile before creating parser"; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/linearTextRepresentation/ltrText.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/linearTextRepresentation/ltrText.cpp index 487e63f00..47214a8f8 100755 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/linearTextRepresentation/ltrText.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/linearTextRepresentation/ltrText.cpp @@ -112,6 +112,9 @@ void LTR_Text::binaryWriteOn(std::ostream& os) const { uint64_t tokenCounter = 0; writeCodedInt(os, this->size()); + if (this->size()==0) { + return; + } SENTENCE_BOUNDS_T:: const_iterator itSb = m_sentenceBounds.begin(); writeCodedInt(os, *itSb); for (LTR_Text::const_iterator itTok = this->begin(); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/linearTextRepresentation/ltrText.h b/lima_linguisticprocessing/src/linguisticProcessing/common/linearTextRepresentation/ltrText.h index 7ac9b8813..73b5122e1 100644 --- 
a/lima_linguisticprocessing/src/linguisticProcessing/common/linearTextRepresentation/ltrText.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/linearTextRepresentation/ltrText.h @@ -87,6 +87,14 @@ class LIMA_LINEARTEXTREPRESENTATION_EXPORT LTR_Text : public std::vector::clear(); + m_sentenceBounds.clear(); + m_namedEntities.clear(); + } + /** @name accessing */ //@{ SENTS_CONST_ITER_T beginSentenceBounds() const { diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/linguisticData/languageData.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/linguisticData/languageData.cpp index 7bd513381..4eb14e1b3 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/linguisticData/languageData.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/linguisticData/languageData.cpp @@ -30,11 +30,13 @@ #include "common/XMLConfigurationFiles/xmlConfigurationFileParser.h" #include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" #include "common/AbstractFactoryPattern/SimpleFactory.h" +#include "common/tools/FileUtils.h" #include "linguisticProcessing/common/PropertyCode/PropertyManager.h" #include "linguisticProcessing/common/PropertyCode/PropertyCodeManager.h" #include #include +#include using namespace std; using namespace Lima::Common::XMLConfigurationFiles; @@ -197,18 +199,39 @@ void LanguageData::initialize( } void LanguageDataPrivate::initPropertyCode( - const std::string& resourcesPath, + const std::string& resourcesPathsStd, XMLConfigurationFileParser& conf) { LDATALOGINIT; - LINFO << "LanguageDataPrivate::initPropertyCode initializes the property coding system"; + LINFO << "LanguageDataPrivate::initPropertyCode initializes the property coding system with resources path" << resourcesPathsStd; try { - std::string propertyFile=resourcesPath + "/" + conf.getModuleGroupParamValue("LinguisticData","Categories","PropertyCodeFile"); + QStringList resourcesPaths= 
QString::fromUtf8(resourcesPathsStd.c_str()).split(LIMA_PATH_SEPARATOR); + bool propertyCodeFileFound = false; + QString propertyCodeFile = conf.getModuleGroupParamValue("LinguisticData","Categories","PropertyCodeFile").c_str(); + Q_FOREACH(QString resourcesPath, resourcesPaths) + { + QString propertyFile(resourcesPath + "/" + propertyCodeFile); #ifdef DEBUG_LP - LDEBUG << "LanguageDataPrivate::initPropertyCode propertyFile is:" << propertyFile; + LDEBUG << "LanguageDataPrivate::initPropertyCode trying property file" << propertyFile; #endif - m_propCodeManager.readFromXmlFile(propertyFile); + QFileInfo propertyFileInfo(propertyFile); + if (propertyFileInfo.exists()) + { +#ifdef DEBUG_LP + LDEBUG << "LanguageDataPrivate::initPropertyCode reading property file" << propertyFileInfo.filePath(); +#endif + m_propCodeManager.readFromXmlFile(propertyFileInfo.filePath().toUtf8().constData()); + propertyCodeFileFound = true; + // Read at most one property code file for a language + break; + } + } + if (!propertyCodeFileFound) + { + LERROR << "No property code file"< -*/ -/************************************************************************ - * @file positionLengthList.cpp - * @author Mesnard Olivier - * @date - * @version - * copyright Copyright (C) 2003 by CEA LIST - * - ***********************************************************************/ - - -#include - -#include "positionLengthList.h" - -using namespace std; - -QTextStream& operator << (QTextStream& os, - const Lima::Common::Misc::PositionLengthList& p) -{ - if (! p.empty()) - { - Lima::Common::Misc::PositionLengthList::const_iterator pos=p.begin(); - os << "(" << (*pos).first << "," << (*pos).second << ")"; - pos++; - while (pos != p.end()) - { - os << "; (" << (*pos).first << "," << (*pos).second << ")"; - pos++; - } - } - return os; -} - -std::ostream& operator << (std::ostream& os, - const Lima::Common::Misc::PositionLengthList& p) -{ - if (! 
p.empty()) - { - Lima::Common::Misc::PositionLengthList::const_iterator pos=p.begin(); - os << "(" << (*pos).first << "," << (*pos).second << ")"; - pos++; - while (pos != p.end()) - { - os << "; (" << (*pos).first << "," << (*pos).second << ")"; - pos++; - } - } - return os; -} - -QDebug& operator << (QDebug& os, - const Lima::Common::Misc::PositionLengthList& p) -{ - if (! p.empty()) - { - Lima::Common::Misc::PositionLengthList::const_iterator pos=p.begin(); - os << "(" << (*pos).first << "," << (*pos).second << ")"; - pos++; - while (pos != p.end()) - { - os << "; (" << (*pos).first << "," << (*pos).second << ")"; - pos++; - } - } - return os; -} +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. If not, see +*/ +/************************************************************************ + * @file positionLengthList.cpp + * @author Mesnard Olivier + * @date + * @version + * copyright Copyright (C) 2003 by CEA LIST + * + ***********************************************************************/ + + +#include + +#include "positionLengthList.h" + +using namespace std; + +#ifdef ANTINNO_SPECIFIC +// FWI 10/01/2014 : dclarations dplace dans le namespace Misc +namespace Lima { +namespace Common { +namespace Misc { +#endif + +QTextStream& operator << (QTextStream& os, + const Lima::Common::Misc::PositionLengthList& p) +{ + if (! 
p.empty()) + { + Lima::Common::Misc::PositionLengthList::const_iterator pos=p.begin(); + os << "(" << (*pos).first << "," << (*pos).second << ")"; + pos++; + while (pos != p.end()) + { + os << "; (" << (*pos).first << "," << (*pos).second << ")"; + pos++; + } + } + return os; +} + +std::ostream& operator << (std::ostream& os, + const Lima::Common::Misc::PositionLengthList& p) +{ + if (! p.empty()) + { + Lima::Common::Misc::PositionLengthList::const_iterator pos=p.begin(); + os << "(" << (*pos).first << "," << (*pos).second << ")"; + pos++; + while (pos != p.end()) + { + os << "; (" << (*pos).first << "," << (*pos).second << ")"; + pos++; + } + } + return os; +} + +QDebug& operator << (QDebug& os, + const Lima::Common::Misc::PositionLengthList& p) +{ + if (! p.empty()) + { + Lima::Common::Misc::PositionLengthList::const_iterator pos=p.begin(); + os << "(" << (*pos).first << "," << (*pos).second << ")"; + pos++; + while (pos != p.end()) + { + os << "; (" << (*pos).first << "," << (*pos).second << ")"; + pos++; + } + } + return os; +} + +#ifdef ANTINNO_SPECIFIC +}}} +#endif diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/misc/positionLengthList.h b/lima_linguisticprocessing/src/linguisticProcessing/common/misc/positionLengthList.h index cfe7e9ec5..87afc6ab1 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/misc/positionLengthList.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/misc/positionLengthList.h @@ -46,9 +46,22 @@ namespace Misc { } // namespace Misc } // namespace Common } // namespace Lima +#ifdef ANTINNO_SPECIFIC +namespace Lima { +namespace Common { +namespace Misc { +// FWI 10/01/2014 : dclarations dplace dans le namespace Misc +LIMA_LPMISC_EXPORT QTextStream& operator << (QTextStream& os, const PositionLengthList& p); +LIMA_LPMISC_EXPORT std::ostream& operator << (std::ostream& os, const PositionLengthList& p); +LIMA_LPMISC_EXPORT QDebug& operator << (QDebug& os, const PositionLengthList& p); 
+} // namespace Misc +} // namespace Common +} // namespace Lima +#else LIMA_LPMISC_EXPORT QTextStream& operator << (QTextStream& os, const Lima::Common::Misc::PositionLengthList& p); LIMA_LPMISC_EXPORT std::ostream& operator << (std::ostream& os, const Lima::Common::Misc::PositionLengthList& p); LIMA_LPMISC_EXPORT QDebug& operator << (QDebug& os, const Lima::Common::Misc::PositionLengthList& p); +#endif #endif // POSITION_LENGTH_LIST_H diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/tgv/TestCaseProcessor.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/tgv/TestCaseProcessor.cpp index 92688a679..f797d6778 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/tgv/TestCaseProcessor.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/tgv/TestCaseProcessor.cpp @@ -35,6 +35,8 @@ #include #include +#ifdef ANTINNO_SPECIFIC +#else std::ostream& operator<<(std::ostream& oss, const QStringList& qsl) { oss << "{"; @@ -45,6 +47,7 @@ std::ostream& operator<<(std::ostream& oss, const QStringList& qsl) oss << "}"; return oss; } +#endif namespace Lima { diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDict/DictionaryData.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDict/DictionaryData.cpp index a64d5a3bc..4b1c0626e 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDict/DictionaryData.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDict/DictionaryData.cpp @@ -1,151 +1,168 @@ -/* - Copyright 2002-2013 CEA LIST - - This file is part of LIMA. - - LIMA is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. 
- - LIMA is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with LIMA. If not, see -*/ -/*************************************************************************** - * Copyright (C) 2004-2012 by CEA LIST * - * * - ***************************************************************************/ -#include "DictionaryData.h" - -#include "common/LimaCommon.h" -#include "linguisticProcessing/LinguisticProcessingCommon.h" - -#include -#include - -#include -#include - - -using namespace std; - -namespace Lima -{ - -namespace LinguisticProcessing -{ - -namespace AnalysisDict -{ - -DictionaryData::DictionaryData() : - m_data(0), - m_entriesAddr(), - m_lingPropertiesAddr() -{} - - -DictionaryData::~DictionaryData() -{ - if (m_data) - { - delete [] m_data; - } -} - -void DictionaryData::loadBinaryFile(const std::string& file) -{ - ANALYSISDICTLOGINIT; - LDEBUG << "DictionaryData::loadBinaryFile" << file; - if( !QFileInfo(file.c_str()).exists()) -// if( !boost::filesystem3::exists(file)) - { - std::string mess = "DictionaryData::loadBinaryFile file "; - mess.append(file).append(" not found!"); - throw( std::logic_error( mess ) ); - } - uint64_t dataSize = QFileInfo(file.c_str()).size(); - LDEBUG << "DictionaryData::loadBinaryFile data size: " << dataSize; - m_data = new unsigned char [dataSize]; - if (m_data == NULL) - { - std::string mess = "DictionaryData::loadBinaryFile memory allocation error"; - throw( std::logic_error( mess ) ); - } - - // load data - FILE *dataFile = fopen(file.c_str(), "rb"); - if (dataFile == NULL) - { - std::ostringstream stro (std::ios::in | std::ios::out); - stro << "DictionaryData::loadBinaryFile error cannot open data file " << file; - throw( Lima::IncompleteResources(stro.str()) ); - } 
- uint64_t readSize = fread(m_data, 1, dataSize, dataFile); //_dataSize = max - fclose(dataFile); - if (readSize != dataSize) - { - std::string mess = "DictionaryData::loadBinaryFile totalDataReadSize != _dataSize "; - throw( std::logic_error( mess ) ); - } - - // parseEntries - unsigned char* p=m_data; - uint64_t nbEntries=readCodedInt(p); - m_entriesAddr.resize(nbEntries); - uint64_t read; - for (vector::iterator entryItr=m_entriesAddr.begin(); - entryItr!=m_entriesAddr.end(); - entryItr++) - { - *entryItr = p; - // go to next entry - read=readCodedInt(p); - if (read == 1) - { - // 1 means delete, next in is length - read=readCodedInt(p); - } - p += read; - } - LDEBUG << "read " << nbEntries << " entries"; - - // parseLingProperties - uint64_t nbLingProp=readCodedInt(p); - m_lingPropertiesAddr.resize(nbLingProp); - for(vector::iterator lingItr=m_lingPropertiesAddr.begin(); - lingItr!=m_lingPropertiesAddr.end(); - lingItr++) - { - *lingItr=p; - read = readCodedInt(p); - p += read; - } - LDEBUG << "read " << nbLingProp << " lingPropsSet"; - Q_ASSERT((uint64_t)(p-m_data) == dataSize); -} - -uint64_t DictionaryData::readCodedInt(unsigned char* &p) -{ - uint64_t val = 0; -// cerr << "start read" << endl; - do - { -// cerr << "val = " << val << " *p = " << (int) *p << endl; - val = (val <<7) + ((*p >> 1) & 0x7F); - } - while (*(p++) & 0x1); -// cerr << "end read val=" << val << endl; - return(val); -} - -} - -} - -} +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. 
+ + You should have received a copy of the GNU Affero General Public License + along with LIMA. If not, see +*/ +/*************************************************************************** + * Copyright (C) 2004-2012 by CEA LIST * + * * + ***************************************************************************/ +#include "DictionaryData.h" + +#include "common/LimaCommon.h" +#include "linguisticProcessing/LinguisticProcessingCommon.h" + +#include +#include + +#include +#include + +#ifdef ANTINNO_SPECIFIC +// FWI 31/10/2013 : ajout #include "antinno.ResourcesIdent.h" +#include "antinno.ResourcesIdent.h" +#endif + +using namespace std; + +namespace Lima +{ + +namespace LinguisticProcessing +{ + +namespace AnalysisDict +{ + +DictionaryData::DictionaryData() : + m_data(0), + m_entriesAddr(), + m_lingPropertiesAddr() +{} + + +DictionaryData::~DictionaryData() +{ + if (m_data) + { + delete [] m_data; + } +} + +void DictionaryData::loadBinaryFile(const std::string& file) +{ + ANALYSISDICTLOGINIT; + LDEBUG << "DictionaryData::loadBinaryFile" << file; + if( !QFileInfo(file.c_str()).exists()) +// if( !boost::filesystem3::exists(file)) + { + std::string mess = "DictionaryData::loadBinaryFile file "; + mess.append(file).append(" not found!"); + throw( std::logic_error( mess ) ); + } + uint64_t dataSize = QFileInfo(file.c_str()).size(); + LDEBUG << "DictionaryData::loadBinaryFile data size: " << dataSize; + m_data = new unsigned char [dataSize]; + if (m_data == NULL) + { + std::string mess = "DictionaryData::loadBinaryFile memory allocation error"; + throw( std::logic_error( mess ) ); + } + + // load data + FILE *dataFile = fopen(file.c_str(), "rb"); + if (dataFile == NULL) + { + std::ostringstream stro (std::ios::in | std::ios::out); + stro << "DictionaryData::loadBinaryFile error cannot open data file " << file; + throw( Lima::IncompleteResources(stro.str()) ); + } + uint64_t readSize = fread(m_data, 1, dataSize, dataFile); //_dataSize = max + fclose(dataFile); + if 
(readSize != dataSize) + { + std::string mess = "DictionaryData::loadBinaryFile totalDataReadSize != _dataSize "; + throw( std::logic_error( mess ) ); + } + + // parseEntries + unsigned char* p=m_data; + +#ifdef ANTINNO_SPECIFIC + // FWI 31/10/2013 : ajout code de lecture de l'entête "Ant" (copie code JYS de S2) + //JYS 01/03/11 Affiche l'identification Antinno si elle est presente, sinon ne fait rien + if (string((char*)p, 3) == "Ant") { + p +=3; + const std::size_t antLen = p[0] + p[1]*0x100 + p[2]*0x10000 + p[3]*0x1000000; + p +=4; + LINFO << "\n" + file + "\n" + ::antinno::ResourcesIdent((char*)p, antLen).toHumanReadableString(); + p += antLen; + } //JYS 01/03/11 + #endif + + uint64_t nbEntries=readCodedInt(p); + m_entriesAddr.resize(nbEntries); + uint64_t read; + for (vector::iterator entryItr=m_entriesAddr.begin(); + entryItr!=m_entriesAddr.end(); + entryItr++) + { + *entryItr = p; + // go to next entry + read=readCodedInt(p); + if (read == 1) + { + // 1 means delete, next in is length + read=readCodedInt(p); + } + p += read; + } + LDEBUG << "read " << nbEntries << " entries"; + + // parseLingProperties + uint64_t nbLingProp=readCodedInt(p); + m_lingPropertiesAddr.resize(nbLingProp); + for(vector::iterator lingItr=m_lingPropertiesAddr.begin(); + lingItr!=m_lingPropertiesAddr.end(); + lingItr++) + { + *lingItr=p; + read = readCodedInt(p); + p += read; + } + LDEBUG << "read " << nbLingProp << " lingPropsSet"; + Q_ASSERT((uint64_t)(p-m_data) == dataSize); +} + +uint64_t DictionaryData::readCodedInt(unsigned char* &p) +{ + uint64_t val = 0; +// cerr << "start read" << endl; + do + { +// cerr << "val = " << val << " *p = " << (int) *p << endl; + val = (val <<7) + ((*p >> 1) & 0x7F); + } + while (*(p++) & 0x1); +// cerr << "end read val=" << val << endl; + return(val); +} + +} + +} + +} diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDict/EnhancedAnalysisDictionary.cpp 
b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDict/EnhancedAnalysisDictionary.cpp index 77e2a77bc..d4c551ee9 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDict/EnhancedAnalysisDictionary.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDict/EnhancedAnalysisDictionary.cpp @@ -21,6 +21,7 @@ #include "AbstractAccessResource.h" #include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" #include "common/AbstractFactoryPattern/SimpleFactory.h" +#include "common/tools/FileUtils.h" #include "common/MediaticData/mediaticData.h" #include "linguisticProcessing/core/LinguisticResources/LinguisticResources.h" @@ -91,6 +92,7 @@ EnhancedAnalysisDictionaryPrivate::EnhancedAnalysisDictionaryPrivate( EnhancedAnalysisDictionaryPrivate::~EnhancedAnalysisDictionaryPrivate() { + delete m_dicoData; } @@ -166,10 +168,11 @@ void EnhancedAnalysisDictionary::init( } try { - std::string binaryFilePath = Common::MediaticData::MediaticData::single().getResourcesPath() + "/" + unitConfiguration.getParamsValueAtKey("dictionaryValuesFile"); - resourceFileWatcher().addPath(QString::fromUtf8(binaryFilePath.c_str())); + QString binaryFilePath = Misc::findFileInPaths(Common::MediaticData::MediaticData::single().getResourcesPath().c_str(), + unitConfiguration.getParamsValueAtKey("dictionaryValuesFile").c_str()); + resourceFileWatcher().addPath(binaryFilePath); QWriteLocker locker(&m_d->m_lock); - m_d->m_dicoData->loadBinaryFile(binaryFilePath); + m_d->m_dicoData->loadBinaryFile(binaryFilePath.toUtf8().constData()); } catch (NoSuchList& ) { diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDict/FsaAccessResource.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDict/FsaAccessResource.cpp index 03a30ae85..46a0aef0f 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDict/FsaAccessResource.cpp +++ 
b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDict/FsaAccessResource.cpp @@ -1,130 +1,188 @@ -/* - Copyright 2002-2013 CEA LIST - - This file is part of LIMA. - - LIMA is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - LIMA is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with LIMA. If not, see -*/ -/*************************************************************************** - * Copyright (C) 2004-2012 by CEA LIST * - * * - ***************************************************************************/ -#include "FsaAccessResource.h" -#include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" -#include "common/AbstractFactoryPattern/SimpleFactory.h" -#include "common/MediaticData/mediaticData.h" -#include "common/FsaAccess/FsaAccessSpare16.h" - -#include -#include -#include - -using namespace Lima::Common::XMLConfigurationFiles; -using namespace Lima::Common; -using namespace std; - -namespace Lima -{ -namespace Common { - namespace FsaAccess { - extern template class LIMA_FSAACCESS_EXPORT FsaAccessReader16 >,struct boost::property > >,struct boost::no_property,struct boost::no_property,struct boost::listS> >; - } -} -namespace LinguisticProcessing -{ -namespace AnalysisDict -{ - -SimpleFactory fsaAccessResourceFactory(FSAACCESSRESSOURCE_CLASSID); - -FsaAccessResource::FsaAccessResource(QObject* parent) - : AbstractAccessResource(parent),m_fsaAccess(0) -{ - connect(this,SIGNAL(resourceFileChanged(QString)),this,SLOT(accessFileChanged(QString))); -} - - 
-FsaAccessResource::~FsaAccessResource() -{ - if (m_fsaAccess!=0) - { - delete m_fsaAccess; - } -} - -void FsaAccessResource::init( - Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, - Manager* manager) -{ - /** @addtogroup ResourceConfiguration - * - <group name="..." class="FsaAccess"> - * -  keyFile : file containing the compiled access keys - */ - - ANALYSISDICTLOGINIT; - try - { - string keyfile=Common::MediaticData::MediaticData::single().getResourcesPath() + "/" + unitConfiguration.getParamsValueAtKey("keyFile"); - FsaAccess::FsaAccessSpare16* fsaAccess=new FsaAccess::FsaAccessSpare16(); - resourceFileWatcher().addPath(QString::fromUtf8(keyfile.c_str())); - QWriteLocker locker(&m_lock); - LINFO << "FsaAccessResource::init read keyFile" << QString::fromUtf8(keyfile.c_str()); - fsaAccess->read(keyfile); - m_fsaAccess=fsaAccess; - } - catch (NoSuchParam& ) - { - LERROR << "no param 'keyFile' in FsaAccessResource group for language " << (int) manager->getInitializationParameters().language; - throw InvalidConfiguration(); - } - catch (AccessByStringNotInitialized& ) - { - LERROR << "keyfile " - << Common::MediaticData::MediaticData::single().getResourcesPath() - << "/" - << unitConfiguration.getParamsValueAtKey("keyFile") - << " no found for language " - << (int) manager->getInitializationParameters().language; - throw InvalidConfiguration(); - } -} - -AbstractAccessByString* FsaAccessResource::getAccessByString() const - { return m_fsaAccess;} - -void FsaAccessResource::accessFileChanged ( const QString & path ) -{ - ANALYSISDICTLOGINIT; - // Check if the file exists as, when a file is replaced, accessFileChanged can be triggered - // two times, when it is first suppressed and when the new version is available. 
One should not - // try to load the missing file - if (QFileInfo(path).exists()) - { - LINFO << "FsaAccessResource::accessFileChanged reload" << path; - FsaAccess::FsaAccessSpare16* fsaAccess=new FsaAccess::FsaAccessSpare16(); - QWriteLocker locker(&m_lock); - fsaAccess->read(path.toUtf8().constData()); - delete m_fsaAccess; - m_fsaAccess=fsaAccess; - Q_EMIT accessFileReloaded(m_fsaAccess); - } - else - { - LINFO << "FsaAccessResource::accessFileChanged deleted, ignoring" << path; - } -} - -} // AnalysisDict -} // LinguisticProcessing -} // Lima +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. 
If not, see +*/ +/*************************************************************************** + * Copyright (C) 2004-2012 by CEA LIST * + * * + ***************************************************************************/ +#include "FsaAccessResource.h" +#include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" +#include "common/AbstractFactoryPattern/SimpleFactory.h" +#include "common/MediaticData/mediaticData.h" +#include "common/FsaAccess/FsaAccessSpare16.h" +#include "common/tools/FileUtils.h" + +#include +#include +#include + +using namespace Lima::Common::XMLConfigurationFiles; +using namespace Lima::Common; +using namespace std; + +namespace Lima +{ +namespace Common { + namespace FsaAccess { + extern template class LIMA_FSAACCESS_EXPORT FsaAccessReader16 >,struct boost::property > >,struct boost::no_property,struct boost::no_property,struct boost::listS> >; + } +} +namespace LinguisticProcessing +{ +namespace AnalysisDict +{ + +SimpleFactory fsaAccessResourceFactory(FSAACCESSRESSOURCE_CLASSID); + +FsaAccessResource::FsaAccessResource(QObject* parent) + : AbstractAccessResource(parent),m_fsaAccess(0) +{ + connect(this,SIGNAL(resourceFileChanged(QString)),this,SLOT(accessFileChanged(QString))); +} + + +FsaAccessResource::~FsaAccessResource() +{ + if (m_fsaAccess!=0) + { + delete m_fsaAccess; + } +} + +void FsaAccessResource::init( + Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, + Manager* manager) +{ + /** @addtogroup ResourceConfiguration + * - <group name="..." 
class="FsaAccess"> + * -  keyFile : file containing the compiled access keys + */ + + ANALYSISDICTLOGINIT; + try + { + QStringList resourcesPaths = QString::fromUtf8(Common::MediaticData::MediaticData::single().getResourcesPath().c_str()).split(LIMA_PATH_SEPARATOR); + Q_FOREACH(QString resPath, resourcesPaths) + { + if (QFileInfo(resPath + "/" + unitConfiguration.getParamsValueAtKey("keyFile").c_str()).exists()) + { + string keyfile= (resPath + "/" + unitConfiguration.getParamsValueAtKey("keyFile").c_str()).toUtf8().constData(); + FsaAccess::FsaAccessSpare16* fsaAccess=new FsaAccess::FsaAccessSpare16(); + resourceFileWatcher().addPath(QString::fromUtf8(keyfile.c_str())); + QWriteLocker locker(&m_lock); +#ifdef ANTINNO_SPECIFIC + // FWI 31/10/2013 : ajout code de lecture de l'entête "Ant" (copie code JYS de S2) + //JYS 09/01/11 Saute l'identification Antinno si elle est presente, sinon ne fait rien + //fsaAccess->read(keyfile); + ifstream fileIn(keyfile.c_str(), ios::in | ios::binary); + if (!fileIn.good()) { + LERROR << "cannot open file " << keyfile; + throw InvalidConfiguration(); + } + char magicNumber[3]; + fileIn.read(magicNumber, 3); + if (string(magicNumber, 3) == "Ant") + { + unsigned char intLe[4]; //UNSIGNED obligatoire + fileIn.read((char*)intLe, 4); + const std::size_t antLen = intLe[0] + intLe[1]*0x100 + intLe[2]*0x10000 + intLe[3]*0x1000000; + const std::size_t pos = fileIn.tellg(); + fileIn.seekg(pos+antLen, ios::beg); //saute l'identification Antinno + } + else + fileIn.seekg(0, ios::beg); //pas un fichier repere par Antinno + fsaAccess->read(fileIn); + //JYS 09/01/11 + LINFO << "FsaAccessResource::init read keyFile" << QString::fromUtf8(keyfile.c_str()); +#else + LINFO << "FsaAccessResource::init read keyFile" << QString::fromUtf8(keyfile.c_str()); + fsaAccess->read(keyfile); +#endif + m_fsaAccess=fsaAccess; + break; + } + } + if (!m_fsaAccess) { + LERROR << "resource file" << unitConfiguration.getParamsValueAtKey("keyFile") << "not found in path" 
+ << Common::MediaticData::MediaticData::single().getResourcesPath(); + } + + + } + catch (NoSuchParam& ) + { +#ifdef ANTINNO_SPECIFIC + ::std::ostringstream oss; + oss << "no param 'keyFile' in FsaAccessResource group for language " << (int) manager->getInitializationParameters().language; + throw InvalidConfiguration(oss.str()); +#else + LERROR << "no param 'keyFile' in FsaAccessResource group for language " << (int) manager->getInitializationParameters().language; + throw InvalidConfiguration(); +#endif + } + catch (AccessByStringNotInitialized& ) + { +#ifdef ANTINNO_SPECIFIC + ::std::ostringstream oss; + oss << "keyfile " + << Common::MediaticData::MediaticData::single().getResourcesPath() + << "/" + << unitConfiguration.getParamsValueAtKey("keyFile") + << " no found for language " + << (int) manager->getInitializationParameters().language; + throw InvalidConfiguration(oss.str()); +#else + LERROR << "keyfile " + << Common::MediaticData::MediaticData::single().getResourcesPath() + << "/" + << unitConfiguration.getParamsValueAtKey("keyFile") + << " no found for language " + << (int) manager->getInitializationParameters().language; + throw InvalidConfiguration(); +#endif + } +} + +AbstractAccessByString* FsaAccessResource::getAccessByString() const + { return m_fsaAccess;} + +void FsaAccessResource::accessFileChanged ( const QString & path ) +{ + ANALYSISDICTLOGINIT; + // Check if the file exists as, when a file is replaced, accessFileChanged can be triggered + // two times, when it is first suppressed and when the new version is available. 
One should not + // try to load the missing file + if (QFileInfo(path).exists()) + { + LINFO << "FsaAccessResource::accessFileChanged reload" << path; + FsaAccess::FsaAccessSpare16* fsaAccess=new FsaAccess::FsaAccessSpare16(); + QWriteLocker locker(&m_lock); + fsaAccess->read(path.toUtf8().constData()); + delete m_fsaAccess; + m_fsaAccess=fsaAccess; + Q_EMIT accessFileReloaded(m_fsaAccess); + } + else + { + LINFO << "FsaAccessResource::accessFileChanged deleted, ignoring" << path; + } +} + +} // AnalysisDict +} // LinguisticProcessing +} // Lima diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDict/FsaRwAccessResource.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDict/FsaRwAccessResource.cpp index 447282c90..a426f6dc9 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDict/FsaRwAccessResource.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDict/FsaRwAccessResource.cpp @@ -23,6 +23,7 @@ #include "FsaRwAccessResource.h" #include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" #include "common/AbstractFactoryPattern/SimpleFactory.h" +#include "common/tools/FileUtils.h" #include "common/MediaticData/mediaticData.h" #include "common/FsaAccess/FsaAccessBuilderRandom16.h" @@ -68,9 +69,35 @@ void FsaRwAccessResource::init( FsaAccess::FsaAccessBuilderRandom16* fsaAccess; try { - string keyfile=Common::MediaticData::MediaticData::single().getResourcesPath() + "/" + unitConfiguration.getParamsValueAtKey("keyFile"); + QString keyfile = Common::Misc::findFileInPaths(Common::MediaticData::MediaticData::single().getResourcesPath().c_str(), unitConfiguration.getParamsValueAtKey("keyFile").c_str()); fsaAccess=new FsaAccess::FsaAccessBuilderRandom16(); - fsaAccess->read(keyfile); + +#ifdef ANTINNO_SPECIFIC + // FWI 31/10/2013 : ajout code de lecture de l'entête "Ant" (copie code JYS de S2) + //JYS 09/01/11 Saute l'identification Antinno si elle est presente, sinon 
ne fait rien + auto* const pFileName = keyfile.toUtf8().constData(); + ifstream fileIn(pFileName, ios::in | ios::binary); + if (!fileIn.good()) { + LERROR << "cannot open file " << pFileName; + throw InvalidConfiguration(); + } + char magicNumber[3]; + fileIn.read(magicNumber, 3); + if (string(magicNumber, 3) == "Ant") { + unsigned char intLe[4]; //UNSIGNED obligatoire + fileIn.read((char*)intLe, 4); + const std::size_t antLen = intLe[0] + intLe[1]*0x100 + intLe[2]*0x10000 + intLe[3]*0x1000000; + std::streamoff pos = fileIn.tellg(); + fileIn.seekg(pos+antLen, ios::beg); //saute l'identification Antinno + } + else fileIn.seekg(0, ios::beg); //pas un fichier repere par Antinno + fsaAccess->read(fileIn); + //JYS 09/01/11 +#else + fsaAccess->read(keyfile.toUtf8().constData()); +#endif + + m_fsaAccess=fsaAccess; m_fsaRwAccess=fsaAccess; } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDict/MultiLevelAnalysisDictionary.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDict/MultiLevelAnalysisDictionary.cpp index ac76bd05e..e6d279aba 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDict/MultiLevelAnalysisDictionary.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDict/MultiLevelAnalysisDictionary.cpp @@ -27,6 +27,7 @@ #include "AbstractAccessResource.h" #include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" #include "common/AbstractFactoryPattern/SimpleFactory.h" +#include "common/tools/FileUtils.h" #include "common/MediaticData/mediaticData.h" #include "linguisticProcessing/core/LinguisticResources/LinguisticResources.h" @@ -85,9 +86,9 @@ void MultiLevelAnalysisDictionary::init( hasMainKeys=true; m_mainKeySize=ldico.keys->getSize(); } - string dataFile=Common::MediaticData::MediaticData::single().getResourcesPath() + "/" + *dataIt; + QString dataFile = Common::Misc::findFileInPaths( 
Common::MediaticData::MediaticData::single().getResourcesPath().c_str(), (*dataIt).c_str()); ldico.data=new DictionaryData(); - ldico.data->loadBinaryFile(dataFile); + ldico.data->loadBinaryFile(dataFile.toUtf8().constData()); m_dicos.push_back(ldico); keyIt++; dataIt++; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowDumper.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowDumper.cpp index 17a555c65..b9670f2d1 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowDumper.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowDumper.cpp @@ -180,64 +180,45 @@ LimaStatusCode BowDumper::process( analysis.setData("SyntacticData",syntacticData); } - // build BoWText from the result of the analysis BoWText bowText; bowText.lang=metadata->getMetaData("Lang"); buildBoWText(annotationData, syntacticData, bowText,analysis,anagraph,posgraph); - // Exclude from the shift list XML entities preceding the offset and - // readjust positions regarding the beginning of the node being analyzed + + + +#ifdef ANTINNO_SPECIFIC + // on exclus de la liste les entits xml qui prcdent l'offset et on recalle les positions par rapport au dbut du noeud en cours d'analyse uint64_t offset = metadata->getStartOffset(); - QMap localShiftFrom; - const auto& globalShiftFrom = handler->shiftFrom(); -#ifdef DEBUG_LP - LDEBUG << "BowDumper::process offset:" << offset; - LDEBUG << "BowDumper::process globalShiftFrom:" << globalShiftFrom; -#endif - if (!globalShiftFrom.isEmpty()) + QMap shiftFrom; + auto const& m = handler->shiftFrom(); + if (!m.isEmpty()) { uint64_t diff = 0; - // start first loop at second position - auto it=globalShiftFrom.constBegin()+1; - for (; it!=globalShiftFrom.constEnd(); ++it) + for (auto it=m.constBegin()+1; it!=m.constEnd(); ++it) { -#ifdef DEBUG_LP - LDEBUG << "BowDumper::process it.key():"<= offset) break; diff = it.value(); + //::std::cout 
<< "diff: " << diff << ::std::endl; } -#ifdef DEBUG_LP - LDEBUG << "BowDumper::process after shiftFrom loop, diff is:" << diff; -#endif - // rewind by one to not miss the first entity and then - // continue from where we stoped the shift corrections - for (it = it -1; it!=globalShiftFrom.constEnd(); ++it) - { -#ifdef DEBUG_LP - LDEBUG << "BowDumper::process it.key():"<= offset && it.value() > diff) + for (auto it=m.constBegin(); it!=m.constEnd(); ++it) + if (it.value() > diff) { - // empirical correction but seems to work - localShiftFrom.insert(it.key()+diff, it.value()-diff); + shiftFrom.insert(it.key()+diff, it.value()-diff); // empirique mais a a l'air de marcher + //::std::cout << "it.key()+diff: " << it.key()+diff << "it.value()-diff: " << it.value()-diff << ::std::endl; } - } } -#ifdef DEBUG_LP - LDEBUG << "BowDumper::process localShiftFrom:" << localShiftFrom; + BoWBinaryWriter writer(shiftFrom); +#else + BoWBinaryWriter writer(handler->shiftFrom()); #endif - BoWBinaryWriter writer(localShiftFrom); DumperStream* dstream=initialize(analysis); #ifdef DEBUG_LP - LDEBUG << "BowDumper::process writing BoW text on" << dstream->out(); + LDEBUG << "BowDumper::process writing BoW text on" << dstream->out(); #endif writer.writeBoWText(dstream->out(),bowText); delete dstream; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowGeneration.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowGeneration.cpp index c67c306f8..2f5742706 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowGeneration.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowGeneration.cpp @@ -592,13 +592,25 @@ std::vector< std::pair< boost::shared_ptr< BoWRelation >, boost::shared_ptr< Abs bool createdSpecificEntity(false); // note: anaVertices size should be 0 or 1 +#ifdef ANTINNO_SPECIFIC + Q_FOREACH ( AnnotationGraphVertex anaVertex, anaVertices) +#else for ( 
AnnotationGraphVertex anaVertex : anaVertices) +#endif { #ifdef DEBUG_LP +#ifdef ANTINNO_SPECIFIC + LDEBUG << "BowGenerator::createAbstractBoWElement Looking at analysis graph vertex " << anaVertex << " ----------------------------"; +#else LDEBUG << "BowGenerator::createAbstractBoWElement Looking at analysis graph vertex " << anaVertex; +#endif #endif std::set< AnnotationGraphVertex > matches = annotationData->matches("AnalysisGraph",anaVertex,"annot"); +#ifdef ANTINNO_SPECIFIC + Q_FOREACH (AnnotationGraphVertex matchVertex, matches) +#else for (AnnotationGraphVertex matchVertex: matches) +#endif { #ifdef DEBUG_LP LDEBUG << "BowGenerator::createAbstractBoWElement Looking at annotation graph vertex " << matchVertex; @@ -630,7 +642,11 @@ std::vector< std::pair< boost::shared_ptr< BoWRelation >, boost::shared_ptr< Abs #ifdef DEBUG_LP LDEBUG << "BowGenerator::createAbstractBoWElement there are " << matches.size() << " annotation graph vertices matching the current PsGraph vertex " << v; #endif +#ifdef ANTINNO_SPECIFIC + Q_FOREACH (AnnotationGraphVertex vx, matches) +#else for (AnnotationGraphVertex vx: matches) +#endif { #ifdef DEBUG_LP LDEBUG << "BowGenerator::createAbstractBoWElement Looking at annotation graph vertex " << vx; @@ -674,7 +690,11 @@ std::vector< std::pair< boost::shared_ptr< BoWRelation >, boost::shared_ptr< Abs bool toKeep = true; if (data!=0) { + #ifdef ANTINNO_SPECIFIC + Q_FOREACH (const auto& elem, *data) +#else for (const auto& elem: *data) +#endif { if (!keepAnyway && !shouldBeKept(elem)) { @@ -685,7 +705,11 @@ std::vector< std::pair< boost::shared_ptr< BoWRelation >, boost::shared_ptr< Abs } if (toKeep) { +#ifdef ANTINNO_SPECIFIC + Q_FOREACH (boost::shared_ptr< BoWPredicate> bP, createPredicate(v, vx, annotationData, anagraph, posgraph, offsetBegin, visited, keepAnyway)) +#else for (boost::shared_ptr< BoWPredicate >& bP: createPredicate(v, vx, annotationData, anagraph, posgraph, offsetBegin, visited, keepAnyway)) +#endif { if (bP!=0) { @@ 
-1142,8 +1166,8 @@ boost::shared_ptr< BoWNamedEntity > BowGenerator::createSpecificEntity( QList< boost::shared_ptr< BoWPredicate > > BowGenerator::createPredicate( const LinguisticGraphVertex& lgv, const AnnotationGraphVertex& agv, const AnnotationData* annotationData, const LinguisticGraph& anagraph, const LinguisticGraph& posgraph, const uint64_t offset, std::set< LinguisticGraphVertex >& visited, bool keepAnyway) const { - DUMPERLOGINIT; #ifdef DEBUG_LP + DUMPERLOGINIT; LDEBUG << "BowGenerator::createPredicate ling:" << lgv << "; annot:" << agv; #endif QList< boost::shared_ptr< BoWPredicate > > result; @@ -1152,10 +1176,12 @@ QList< boost::shared_ptr< BoWPredicate > > BowGenerator::createPredicate( // FIXME handle the ambiguous case when there is several class values for the predicate QStringList predicateIds=annotationData->stringAnnotation(agv,Common::Misc::utf8stdstring2limastring("Predicate")).split("|"); +#ifdef DEBUG_LP if (predicateIds.size()>1) { LDEBUG << "BowGenerator::createPredicate Predicate has" << predicateIds.size() << "values:" << predicateIds; } +#endif // FIXED replace the hardcoded VerbNet by a value from configuration @@ -1168,9 +1194,9 @@ QList< boost::shared_ptr< BoWPredicate > > BowGenerator::createPredicate( try { EntityType predicateEntity= Common::MediaticData::MediaticData::single().getEntityType(predicate); - #ifdef DEBUG_LP +#ifdef DEBUG_LP LDEBUG << "BowGenerator::createPredicate The role(s) related to "<< predicate << " is/are "; - #endif +#endif AnnotationGraph annotGraph=annotationData->getGraph(); AnnotationGraphOutEdgeIt outIt, outIt_end; boost::tie(outIt, outIt_end) = boost::out_edges(agv, annotationData->getGraph()); @@ -1181,9 +1207,18 @@ QList< boost::shared_ptr< BoWPredicate > > BowGenerator::createPredicate( // FIXME handle the ambiguous case when there is several values for each role const AnnotationGraphVertex semRoleVx=boost::target(*outIt, annotGraph); QStringList semRoleIds = 
annotationData->stringAnnotation(agv,semRoleVx,typeAnnot).split("|"); + if (predicateIds.size() != semRoleIds.size()) + { + DUMPERLOGINIT; + LERROR << "BowGenerator::createPredicate predicateIds and semRoleIds sizes are different:" << predicateIds.size() << "and" << semRoleIds.size(); + LERROR << "BowGenerator::createPredicate abort this predicate creation"; + return result; + } Q_ASSERT(predicateIds.size() == semRoleIds.size()); LimaString semRole = semRoleIds[i]; +#ifdef DEBUG_LP LDEBUG << semRole; +#endif if (semRole.isEmpty()) continue; try { @@ -1194,16 +1229,17 @@ QList< boost::shared_ptr< BoWPredicate > > BowGenerator::createPredicate( LinguisticGraphVertex posGraphSemRoleVertex = *(posGraphSemRoleVertices.begin()); if (posGraphSemRoleVertex == lgv) { + DUMPERLOGINIT; LERROR << "BowGenerator::createPredicate role vertex is the same as the trigger vertex. Abort this role."; continue; } - #ifdef DEBUG_LP +#ifdef DEBUG_LP LDEBUG << "BowGenerator::createPredicate Calling createAbstractBoWElement on PoS graph vertex" << posGraphSemRoleVertex; - #endif +#endif std::vector, boost::shared_ptr< AbstractBoWElement > > > semRoleTokens = createAbstractBoWElement(posGraphSemRoleVertex, anagraph,posgraph, offset, annotationData, visited, keepAnyway); - #ifdef DEBUG_LP +#ifdef DEBUG_LP LDEBUG << "BowGenerator::createPredicate Created "<< semRoleTokens.size()<<"token for the role associated to " << predicate; - #endif +#endif // if (semRoleTokens[0].second!="") if (!semRoleTokens.empty()) { @@ -1212,13 +1248,14 @@ QList< boost::shared_ptr< BoWPredicate > > BowGenerator::createPredicate( } else { - #ifdef DEBUG_LP +#ifdef DEBUG_LP LDEBUG << "BowGenerator::createPredicate Found no matching for the semRole in the annot graph"; - #endif +#endif } } catch (const Lima::LimaException& e) { + DUMPERLOGINIT; LERROR << "BowGenerator::createPredicate Unknown semantic role" << semRole << ";" << e.what(); } } @@ -1227,7 +1264,9 @@ QList< boost::shared_ptr< BoWPredicate > > 
BowGenerator::createPredicate( bowP->setLength(token->length()); bowP->setPredicateType(predicateEntity); Common::MediaticData::EntityType pEntityType=bowP->getPredicateType(); +#ifdef DEBUG_LP LDEBUG << "BowGenerator::createPredicate Created a Predicate for the verbal class " << Common::MediaticData::MediaticData::single().getEntityName(pEntityType); +#endif if (!roles.empty()) { bowP->setRoles(roles); @@ -1239,9 +1278,9 @@ QList< boost::shared_ptr< BoWPredicate > > BowGenerator::createPredicate( if (outputRoles != 0) { LimaString roleLabel=Common::MediaticData::MediaticData::single().getEntityName(it.key()); - #ifdef DEBUG_LP +#ifdef DEBUG_LP LDEBUG << "BowGenerator::createPredicate Associated "<< QString::fromUtf8(outputRoles->getOutputUTF8String().c_str()) << " to it" << "via the semantic role label "<< roleLabel ; - #endif +#endif } } } @@ -1249,6 +1288,7 @@ QList< boost::shared_ptr< BoWPredicate > > BowGenerator::createPredicate( } catch (const Lima::LimaException& e) { + DUMPERLOGINIT; LERROR << "BowGenerator::createPredicate Unknown predicate" << predicate << ";" << e.what(); return QList< boost::shared_ptr< BoWPredicate > >(); } @@ -1409,6 +1449,13 @@ std::vector BowGenerator::createNEParts( #ifdef DEBUG_LP DUMPERLOGINIT; #endif + +#ifdef ANTINNO_SPECIFIC +#ifdef DEBUG_LP + LDEBUG << "BowGenerator: createNEParts(...)"; +#endif +#endif + const LinguisticGraph& graph = (frompos?posgraph:anagraph); const FsaStringsPool& sp=Common::MediaticData::MediaticData::single().stringsPool(m_language); @@ -1500,6 +1547,12 @@ std::vector BowGenerator::createNEParts( const Token* token = get(vertex_token, graph, *m); const MorphoSyntacticData* data = get(vertex_data, graph, *m); +#ifdef ANTINNO_SPECIFIC +#ifdef DEBUG_LP + LDEBUG << "BowGenerator: createNEParts(...) 
token->form(): " << token->form(); +#endif +#endif + if (data!=0 && !data->empty()) { const LinguisticElement& elem=*(data->begin()); @@ -1527,6 +1580,13 @@ std::vector BowGenerator::createNEParts( category, token->position(), token->length())); +#ifdef ANTINNO_SPECIFIC +#ifdef DEBUG_LP + LDEBUG << "BowGenerator: token->stringForm(): " << token->stringForm(); + LDEBUG << "BowGenerator: sp[/*elem.normalizedForm*/ " << elem.normalizedForm << "]: \"" << sp[elem.normalizedForm] << "\""; +#endif +#endif + } } } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/EasyXmlDumper/ConstituantAndRelationExtractor.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/EasyXmlDumper/ConstituantAndRelationExtractor.cpp index 35d2a67cb..48d2e4743 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/EasyXmlDumper/ConstituantAndRelationExtractor.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/EasyXmlDumper/ConstituantAndRelationExtractor.cpp @@ -489,7 +489,7 @@ void ConstituantAndRelationExtractor::constructionDesGroupes() { relationsToFollow.insert("SujInv"); relationsToFollow.insert("TIl"); - // relationsToFollow.insert("TEMPCOMP"); + // relationsToFollow.insert("aux"); newGrp = createGroupe(forme, relationsToFollow, "NV"); if(newGrp == 0) { @@ -536,7 +536,7 @@ void ConstituantAndRelationExtractor::constructionDesGroupes() else if ( forme->micro == "ADV" && forme->hasOutRelation("AdvSub") ) { relationsToFollow.insert("AdvSub"); - // relationsToFollow.insert("TEMPCOMP"); + // relationsToFollow.insert("aux"); newGrp = createGroupe(forme, relationsToFollow, "GP"); if (newGrp == 0) { @@ -565,7 +565,7 @@ void ConstituantAndRelationExtractor::constructionDesGroupes() else if ( forme->micro == "NC" && forme->hasInRelation("SUBSUBJUX") ) { relationsToFollow.insert("SUBSUBJUX"); - // relationsToFollow.insert("TEMPCOMP"); + // relationsToFollow.insert("aux"); newGrp = 
createGroupe(forme, relationsToFollow, "GN", true); } else if ( forme->micro == "PROREL" ) @@ -644,7 +644,7 @@ void ConstituantAndRelationExtractor::constructionDesGroupes() { relationsToFollow.insert("PronReflVerbe"); relationsToFollow.insert("AuxCplPrev"); - // relationsToFollow.insert("TEMPCOMP"); + // relationsToFollow.insert("aux"); newGrp = createGroupe(forme, relationsToFollow, "NV"); } else if ( forme->micro == "PROREL" ) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/GenericXmlDumper.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/GenericXmlDumper.cpp index 80a68b9c1..d358243e4 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/GenericXmlDumper.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/GenericXmlDumper.cpp @@ -370,6 +370,7 @@ xmlOutput(std::ostream& out, { // no sentence bounds : dump all text at once xmlOutputVertices(out, + analysis, anagraph, posgraph, annotationData, @@ -394,12 +395,13 @@ xmlOutput(std::ostream& out, // if (sentenceEnd==posgraph->lastVertex()) { // continue; // } - + LDEBUG << "dump sentence between " << sentenceBegin << " and " << sentenceEnd; LDEBUG << "dump simple terms for this sentence"; ostringstream oss; xmlOutputVertices(oss, + analysis, anagraph, posgraph, annotationData, @@ -424,6 +426,7 @@ xmlOutput(std::ostream& out, void GenericXmlDumper:: xmlOutputVertices(std::ostream& out, + AnalysisContent& analysis, AnalysisGraph* anagraph, AnalysisGraph* posgraph, const Common::AnnotationGraphs::AnnotationData* annotationData, @@ -509,7 +512,7 @@ xmlOutputVertices(std::ostream& out, continue; }*/ ostringstream oss; - xmlOutputVertex(oss,(*d),anagraph,posgraph,annotationData,syntacticData, + xmlOutputVertex(oss,analysis,(*d),anagraph,posgraph,annotationData,syntacticData, sp,offset,visited,alreadyStoredVertices); uint64_t pos=(*it).first->position(); xmlOutputs[pos].push_back(oss.str()); @@ 
-526,6 +529,7 @@ xmlOutputVertices(std::ostream& out, void GenericXmlDumper:: xmlOutputVertex(std::ostream& out, + AnalysisContent& analysis, LinguisticGraphVertex v, AnalysisGraph* anagraph, AnalysisGraph* posgraph, @@ -545,7 +549,7 @@ xmlOutputVertex(std::ostream& out, se=checkSpecificEntity(v,anagraph,posgraph,annotationData); if (se.first!=0) { LDEBUG << "GenericXmlDumper: -- is a specific entity "; - if (xmlOutputSpecificEntity(out,se.first,se.second,sp,offset)) { + if (xmlOutputSpecificEntity(out,analysis,se.first,se.second,sp,offset)) { return; } else { @@ -561,7 +565,7 @@ xmlOutputVertex(std::ostream& out, if (compoundTokens.size()!=0) { for (auto it=compoundTokens.begin(), it_end=compoundTokens.end();it!=it_end;it++) { - xmlOutputCompound(out,(*it),anagraph,posgraph,annotationData,sp,offset); + xmlOutputCompound(out,analysis,(*it),anagraph,posgraph,annotationData,sp,offset); std::set bowTokenVertices = (*it)->getVertices(); alreadyStoredVertices.insert(bowTokenVertices.begin(), bowTokenVertices.end()); } @@ -571,7 +575,7 @@ xmlOutputVertex(std::ostream& out, LDEBUG << "GenericXmlDumper: -- is simple word "; // if not a specific entity nor a compound, output simple word infos if (m_outputWords) { - xmlOutputVertexInfos(out, v, posgraph, offset); + xmlOutputVertexInfos(out, analysis, v, posgraph, offset); } } @@ -622,6 +626,7 @@ GenericXmlDumper::checkSpecificEntity(LinguisticGraphVertex v, bool GenericXmlDumper:: xmlOutputSpecificEntity(std::ostream& out, + AnalysisContent& analysis, const SpecificEntities::SpecificEntityAnnotation* se, LinguisticAnalysisStructure::AnalysisGraph* graph, const FsaStringsPool& sp, @@ -641,7 +646,7 @@ xmlOutputSpecificEntity(std::ostream& out, string value=xmlString(specificEntityFeature(se,m_featureNames[i],sp,offset)); if (value.empty()) { // otherwise, get features from head - value=xmlString(m_features[i]->getValue(graph,se->getHead())); + value=xmlString(m_features[i]->getValue(graph,se->getHead(),analysis)); } out << " " 
<< m_featureTags[i] << "=\"" << value << "\""; } @@ -653,7 +658,7 @@ xmlOutputSpecificEntity(std::ostream& out, for (std::vector< LinguisticGraphVertex>::const_iterator m(se->m_vertices.begin()); m != se->m_vertices.end(); m++) { - xmlOutputVertexInfos(out,(*m),graph,offset); + xmlOutputVertexInfos(out,analysis,(*m),graph,offset); } out << "" << endl; } @@ -667,7 +672,7 @@ xmlOutputSpecificEntity(std::ostream& out, for (std::vector< LinguisticGraphVertex>::const_iterator m(se->m_vertices.begin()); m != se->m_vertices.end(); m++) { - xmlOutputVertexInfos(out,(*m),graph,offset); + xmlOutputVertexInfos(out,analysis,(*m),graph,offset); } } @@ -732,6 +737,7 @@ checkCompound(LinguisticGraphVertex v, void GenericXmlDumper:: xmlOutputCompound(std::ostream& out, + AnalysisContent& analysis, boost::shared_ptr token, LinguisticAnalysisStructure::AnalysisGraph* anagraph, LinguisticAnalysisStructure::AnalysisGraph* posgraph, @@ -775,7 +781,7 @@ xmlOutputCompound(std::ostream& out, while (! bit.isAtEnd()) { boost::shared_ptr< AbstractBoWElement > tok=bit.getElement(); LDEBUG << "next token=" << tok->getOutputUTF8String(); - xmlOutputCompound(out,tok,anagraph,posgraph,annotationData,sp,offset); + xmlOutputCompound(out,analysis,tok,anagraph,posgraph,annotationData,sp,offset); bit++; } } @@ -784,7 +790,7 @@ xmlOutputCompound(std::ostream& out, boost::shared_ptr< BoWTerm > term=boost::dynamic_pointer_cast(token); const std::deque< BoWComplexToken::Part >& parts=term->getParts(); for (auto p=parts.begin(),p_end=parts.end();p!=p_end;p++) { - xmlOutputCompound(out,(*p).getBoWToken(),anagraph,posgraph,annotationData,sp,offset); + xmlOutputCompound(out,analysis,(*p).getBoWToken(),anagraph,posgraph,annotationData,sp,offset); } } @@ -804,7 +810,7 @@ xmlOutputCompound(std::ostream& out, LERROR << "GenericXmlDumper: for vertex " << v << ": specific entity not found"; } else { - xmlOutputSpecificEntity(out,se.first,se.second,sp,offset); + 
xmlOutputSpecificEntity(out,analysis,se.first,se.second,sp,offset); } } break; @@ -813,7 +819,7 @@ xmlOutputCompound(std::ostream& out, if (m_outputCompoundParts) { LinguisticGraphVertex v=boost::dynamic_pointer_cast(token)->getVertex(); LDEBUG << "GenericXmlDumper: output BoWToken of vertex " << v; - xmlOutputVertexInfos(out,v,posgraph,offset); + xmlOutputVertexInfos(out,analysis,v,posgraph,offset); } break; } @@ -826,6 +832,7 @@ xmlOutputCompound(std::ostream& out, } void GenericXmlDumper::xmlOutputVertexInfos(std::ostream& out, + AnalysisContent& analysis, LinguisticGraphVertex v, LinguisticAnalysisStructure::AnalysisGraph* graph, uint64_t offset) const @@ -835,14 +842,14 @@ void GenericXmlDumper::xmlOutputVertexInfos(std::ostream& out, std::string value; // for position, correct with offset : hard coded name if (m_features[i]->getName()=="position") { - unsigned int pos=atoi(m_features[i]->getValue(graph,v).c_str()); + unsigned int pos=atoi(m_features[i]->getValue(graph,v,analysis).c_str()); pos+=offset; ostringstream oss; oss << pos; value=oss.str(); } else { - value=xmlString(m_features[i]->getValue(graph,v)); + value=xmlString(m_features[i]->getValue(graph,v,analysis)); } out << " " << m_featureTags[i] << "=\"" << value << "\""; } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/GenericXmlDumper.h b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/GenericXmlDumper.h index bb0691e26..3ea594732 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/GenericXmlDumper.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/GenericXmlDumper.h @@ -107,6 +107,7 @@ class LIMA_ANALYSISDUMPERS_EXPORT GenericXmlDumper : public AbstractTextualAnaly const SyntacticAnalysis::SyntacticData* syntacticData) const; void xmlOutputVertices(std::ostream& out, + AnalysisContent& analysis, LinguisticAnalysisStructure::AnalysisGraph* anagraph, 
LinguisticAnalysisStructure::AnalysisGraph* posgraph, const Common::AnnotationGraphs::AnnotationData* annotationData, @@ -117,6 +118,7 @@ class LIMA_ANALYSISDUMPERS_EXPORT GenericXmlDumper : public AbstractTextualAnaly const uint64_t offset) const; void xmlOutputVertex(std::ostream& out, + AnalysisContent& analysis, LinguisticGraphVertex v, LinguisticAnalysisStructure::AnalysisGraph* anagraph, LinguisticAnalysisStructure::AnalysisGraph* posgraph, @@ -127,10 +129,7 @@ class LIMA_ANALYSISDUMPERS_EXPORT GenericXmlDumper : public AbstractTextualAnaly std::set& visited, std::set& alreadyStoredVertices) const; - void xmlOutputVertexInfos(std::ostream& out, - LinguisticGraphVertex v, - LinguisticAnalysisStructure::AnalysisGraph* anagraph, - uint64_t offset) const; + void xmlOutputVertexInfos(std::ostream& out, Lima::AnalysisContent& analysis, LinguisticGraphVertex v, Lima::LinguisticProcessing::LinguisticAnalysisStructure::AnalysisGraph* graph, uint64_t offset) const; void xmlOutputBoWInfos(std::ostream& out, Common::BagOfWords::AbstractBoWElement* token, @@ -149,6 +148,7 @@ class LIMA_ANALYSISDUMPERS_EXPORT GenericXmlDumper : public AbstractTextualAnaly const Common::AnnotationGraphs::AnnotationData* annotationData) const; bool xmlOutputSpecificEntity(std::ostream& out, + AnalysisContent& analysis, const SpecificEntities::SpecificEntityAnnotation* se, LinguisticAnalysisStructure::AnalysisGraph* anagraph, const FsaStringsPool& sp, @@ -170,7 +170,9 @@ class LIMA_ANALYSISDUMPERS_EXPORT GenericXmlDumper : public AbstractTextualAnaly uint64_t offset, std::set& visited) const; - void xmlOutputCompound(std::ostream& out, boost::shared_ptr token, Lima::LinguisticProcessing::LinguisticAnalysisStructure::AnalysisGraph* anagraph, Lima::LinguisticProcessing::LinguisticAnalysisStructure::AnalysisGraph* posgraph, const Lima::Common::AnnotationGraphs::AnnotationData* annotationData, const Lima::FsaStringsPool& sp, uint64_t offset) const; + void xmlOutputCompound(std::ostream& out, + 
AnalysisContent& analysis, + boost::shared_ptr token, Lima::LinguisticProcessing::LinguisticAnalysisStructure::AnalysisGraph* anagraph, Lima::LinguisticProcessing::LinguisticAnalysisStructure::AnalysisGraph* posgraph, const Lima::Common::AnnotationGraphs::AnnotationData* annotationData, const Lima::FsaStringsPool& sp, uint64_t offset) const; /*void xmlOutputVertexInfos(std::ostream& out, const LinguisticAnalysisStructure::Token* ft, diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/LTRTextBuilder.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/LTRTextBuilder.cpp index 970c05dff..8edbb52ec 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/LTRTextBuilder.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/LTRTextBuilder.cpp @@ -78,10 +78,28 @@ LTRTextBuilder::LTRTextBuilder( void LTRTextBuilder::buildLTRTextFrom( const LinguisticGraph& graph, SegmentationData* sb, + const LinguisticGraphVertex& graphFirstVertex, const LinguisticGraphVertex& graphLastVertex, LTR_Text* textRep, uint64_t offset) { + if (sb==0) { + // no segmentation data: add tokens from all text + uint64_t tokenCounter = 0; + this->addTokensToLTRTextFrom( + graph, + graphFirstVertex, // from first vertex + graphLastVertex, // to last vertex + graphLastVertex, + textRep, + offset, + &tokenCounter); + // add a global sentence boundary (thay covers all the text) + DUMPERLOGINIT; + LDEBUG << "LTR: add sentence bound at token" << tokenCounter; + textRep->addSentenceBound(tokenCounter); + } + else { // ??OME2 SegmentationData::iterator sbIt = sb->begin(); std::vector::iterator sbIt = (sb->getSegments()).begin(); uint64_t tokenCounter = 0; @@ -91,8 +109,8 @@ void LTRTextBuilder::buildLTRTextFrom( LinguisticGraphVertex sentenceEnd = sbIt->getLastVertex(); this->addTokensToLTRTextFrom( graph, - sentenceBegin, - sentenceEnd, + sentenceBegin, // from sentence beginning + sentenceEnd, 
// to sentence end graphLastVertex, textRep, offset, @@ -100,6 +118,7 @@ void LTRTextBuilder::buildLTRTextFrom( textRep->addSentenceBound(tokenCounter); sbIt ++; } + } } void LTRTextBuilder::addTokensToLTRTextFrom( diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/LTRTextBuilder.h b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/LTRTextBuilder.h index f2ca6ed41..cdd776dd8 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/LTRTextBuilder.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/LTRTextBuilder.h @@ -87,9 +87,18 @@ class LIMA_ANALYSISDUMPERS_EXPORT LTRTextBuilder { LTRTextBuilder( const MediaId& language, StopList* stopList); + /** @brief build a LTRText representation of the analyzed text + * @param graph the linguistic graph containing the analyzed text + * @param sb a pointer on the sentence boundaries segmentation data: if zero, sentence boundaries are ignored, all text is treated as a single segment + * @param graphFirstVertex the first vertex of the text in the linguistic graph (needed when sb==0) + * @param graphLastVertex the last vertex of the text in the linguistic graph (for last segment) + * @param textRep the LTRText built + * @param offset the offset of the text in the document (to have a global correct position) + */ void buildLTRTextFrom( const LinguisticGraph& graph, Lima::LinguisticProcessing::SegmentationData* sb, + const LinguisticGraphVertex& graphFirstVertex, const LinguisticGraphVertex& graphLastVertex, Lima::Common::BagOfWords::LTR_Text* textRep, uint64_t offset); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/SimpleXmlDumper.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/SimpleXmlDumper.cpp index 3615aa3fc..13de6bc7a 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/SimpleXmlDumper.cpp +++ 
b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/SimpleXmlDumper.cpp @@ -167,9 +167,8 @@ xmlOutput(std::ostream& out, AnalysisGraph* posgraph, const Common::AnnotationGraphs::AnnotationData* annotationData) const { -#ifdef DEBUG_LP DUMPERLOGINIT; -#endif + out << "" << endl; LinguisticMetaData* metadata=static_cast(analysis.getData("LinguisticMetaData")); @@ -180,7 +179,6 @@ xmlOutput(std::ostream& out, if (sb==0) { - DUMPERLOGINIT; LWARN << "no SentenceBoundaries"; } @@ -202,9 +200,7 @@ xmlOutput(std::ostream& out, { // ??OME2 uint64_t nbSentences(sb->size()); uint64_t nbSentences((sb->getSegments()).size()); -#ifdef DEBUG_LP LDEBUG << "SimpleXmlDumper: "<< nbSentences << " sentences found"; -#endif for (uint64_t i=0; igetStartOffset()); string str=oss.str(); if (str.empty()) { -#ifdef DEBUG_LP LDEBUG << "nothing to dump in this sentence"; -#endif } else { out << "" << endl @@ -256,10 +249,10 @@ xmlOutputVertices(std::ostream& out, const uint64_t offset) const { -#ifdef DEBUG_LP DUMPERLOGINIT; - LDEBUG << "SimpleXmlDumper::xmlOutputVertices from vertex " << begin << " to vertex " << end; -#endif + LDEBUG << "SimpleXmlDumper: ========================================"; + LDEBUG << "SimpleXmlDumper: outputXml from vertex " << begin << " to vertex " << end; + LinguisticGraph* graph=posgraph->getGraph(); LinguisticGraphVertex lastVertex=posgraph->lastVertex(); @@ -340,10 +333,6 @@ xmlOutputVertex(std::ostream& out, const FsaStringsPool& sp, uint64_t offset) const { -#ifdef DEBUG_LP - DUMPERLOGINIT; - LDEBUG << "SimpleXmlDumper::xmlOutputVertex" << v; -#endif MorphoSyntacticData* data=get(vertex_data,*(posgraph->getGraph()),v); // first, check if vertex corresponds to a specific entity found before pos tagging (i.e. 
in analysis graph) @@ -352,25 +341,16 @@ xmlOutputVertex(std::ostream& out, for (std::set< AnnotationGraphVertex >::const_iterator anaVerticesIt = anaVertices.begin(); anaVerticesIt != anaVertices.end(); anaVerticesIt++) { -#ifdef DEBUG_LP - LDEBUG << "SimpleXmlDumper::xmlOutputVertex AnalysisGraph vertex for" << v << "is" << *anaVerticesIt; -#endif std::set< AnnotationGraphVertex > matches = annotationData->matches("AnalysisGraph",*anaVerticesIt,"annot"); for (std::set< AnnotationGraphVertex >::const_iterator it = matches.begin(); it != matches.end(); it++) { AnnotationGraphVertex vx=*it; -#ifdef DEBUG_LP - LDEBUG << "SimpleXmlDumper::xmlOutputVertex vertex" << v << "," << *anaVerticesIt << "has annot vertex" << vx; -#endif if (annotationData->hasAnnotation(vx, Common::Misc::utf8stdstring2limastring("SpecificEntity"))) { const SpecificEntityAnnotation* se = annotationData->annotation(vx, Common::Misc::utf8stdstring2limastring("SpecificEntity")). pointerValue(); -#ifdef DEBUG_LP - LDEBUG << "SimpleXmlDumper::xmlOutputVertex annot vertex" << vx << "has SpecificEntity annotation"; -#endif if (outputSpecificEntity(out,se,data,anagraph->getGraph(),sp,offset)) { return; } @@ -505,10 +485,9 @@ outputSpecificEntity(std::ostream& out, // take as category for parts the category for the named entity LinguisticCode category=m_propertyAccessor->readValue(data->begin()->properties); -#ifdef DEBUG_LP DUMPERLOGINIT; LDEBUG << "Using category " << m_propertyManager->getPropertySymbolicValue(category) << " for specific entity of type " << typeName; -#endif + // get the parts of the named entity match // use the category of the named entity for all elements for (std::vector< LinguisticGraphVertex>::const_iterator m(se->m_vertices.begin()); @@ -533,7 +512,7 @@ std::string SimpleXmlDumper::xmlString(const std::string& inputStr) const replace(str,"<", "<"); replace(str,">", ">"); replace(str,"\"", """); - replace(str,"\n", ""); + replace(str,"\n", "\n"); return str; } diff --git 
a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/StopList.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/StopList.cpp index 9ab26e25c..2777d2bb7 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/StopList.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/StopList.cpp @@ -24,6 +24,7 @@ #include "StopList.h" #include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" #include "common/MediaticData/mediaticData.h" +#include "common/tools/FileUtils.h" #include "common/Data/strwstrtools.h" #include "common/AbstractFactoryPattern/SimpleFactory.h" @@ -63,10 +64,10 @@ void StopList::init( LIMA_UNUSED(manager); DUMPERLOGINIT; const string& resourcesPath=Common::MediaticData::MediaticData::single().getResourcesPath(); - string stopListFileName; + QString stopListFileName; try { - stopListFileName=resourcesPath+"/"+unitConfiguration.getParamsValueAtKey("file"); + stopListFileName = Common::Misc::findFileInPaths(resourcesPath.c_str(), unitConfiguration.getParamsValueAtKey("file").c_str()); } catch (Common::XMLConfigurationFiles::NoSuchParam& ) { @@ -74,7 +75,7 @@ void StopList::init( throw InvalidConfiguration(); } - std::ifstream stopListFile(stopListFileName.c_str(), std::ifstream::binary); + std::ifstream stopListFile(stopListFileName.toUtf8().constData(), std::ifstream::binary); if (!stopListFile) { LERROR << "invalid file " << stopListFileName; throw InvalidConfiguration(); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/TextFeaturesDumper.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/TextFeaturesDumper.cpp index ae3dc05b8..fad91d30d 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/TextFeaturesDumper.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/TextFeaturesDumper.cpp @@ -109,7 +109,7 @@ void 
TextFeaturesDumper::init(Common::XMLConfigurationFiles::GroupConfigurationS } LimaStatusCode TextFeaturesDumper::process( - AnalysisContent& analysis) const + AnalysisContent& analysis) const { DUMPERLOGINIT; LinguisticMetaData* metadata=static_cast(analysis.getData("LinguisticMetaData")); @@ -166,7 +166,7 @@ LimaStatusCode TextFeaturesDumper::process( ftItr!=categoriesMapping.end(); ftItr++) { - outputVertex(dstream->out(),anagraph,ftItr->second,metadata->getStartOffset()); + outputVertex(dstream->out(),anagraph,ftItr->second,analysis,metadata->getStartOffset()); } delete dstream; @@ -175,10 +175,7 @@ LimaStatusCode TextFeaturesDumper::process( void TextFeaturesDumper:: -outputVertex(std::ostream& out, - const LinguisticAnalysisStructure::AnalysisGraph* graph, - LinguisticGraphVertex v, - uint64_t /*offset*/) const +outputVertex(ostream& out, const AnalysisGraph* graph, LinguisticGraphVertex v, AnalysisContent& analysis, uint64_t offset /*offset*/) const { //TODO : use offset bool first=true; @@ -190,7 +187,7 @@ outputVertex(std::ostream& out, out << m_sep; } // take only first morphosyntactic data - string str=(*it)->getValue(graph,v); + string str=(*it)->getValue(graph,v,analysis); boost::replace_all(str,m_sep,m_sepReplace); out << str; } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/TextFeaturesDumper.h b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/TextFeaturesDumper.h index 4d517f1e3..8d9874a95 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/TextFeaturesDumper.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/TextFeaturesDumper.h @@ -70,6 +70,7 @@ class LIMA_ANALYSISDUMPERS_EXPORT TextFeaturesDumper : public AbstractTextualAna void outputVertex(std::ostream& out, const LinguisticAnalysisStructure::AnalysisGraph* graph, LinguisticGraphVertex v, + AnalysisContent& analysis, uint64_t offset=0) const; }; diff --git 
a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/WordFeatureExtractor.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/WordFeatureExtractor.cpp index 3e4e79218..6059b010c 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/WordFeatureExtractor.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/WordFeatureExtractor.cpp @@ -63,6 +63,8 @@ FeatureExtractorFactory FeatureLemmaFactory(FeatureLemma_ID); FeatureExtractorFactory FeaturePropertyFactory(FeatureProperty_ID); FeatureExtractorFactory FeatureTstatusFactory(FeatureTstatus_ID); FeatureExtractorFactory FeatureSpecificEntityFactory(FeatureSpecificEntity_ID); +FeatureExtractorFactory FeatureLemmaSpecificEntityFactory(FeatureLemmaSpecificEntity_ID); +FeatureExtractorFactory FeatureStoredDataFactory(FeatureStoredData_ID); //*********************************************************************** // Feature list @@ -79,12 +81,12 @@ m_language(language) WordFeatures::~WordFeatures() { - for (WordFeatures::iterator it=begin(),it_end=end(); it!=it_end; it++) { - if (*it) { - delete (*it); - *it=0; - } - } +// for (WordFeatures::iterator it=begin(),it_end=end(); it!=it_end; it++) { +// if (*it) { +// delete (*it); +// *it=0; +// } +// } } void WordFeatures::initialize(const deque& featureNames) @@ -115,7 +117,8 @@ AbstractFeatureExtractor(language,complement) std::string FeaturePosition:: getValue(const LinguisticAnalysisStructure::AnalysisGraph* graph, - LinguisticGraphVertex v) const + LinguisticGraphVertex v, + AnalysisContent & /*unused*/) const { Token* token=get(vertex_token,*(graph->getGraph()),v); if (token==0) { @@ -133,7 +136,9 @@ AbstractFeatureExtractor(language,complement) std::string FeatureToken:: getValue(const LinguisticAnalysisStructure::AnalysisGraph* graph, - LinguisticGraphVertex v) const + LinguisticGraphVertex v, + AnalysisContent & /*unused*/ + ) const { Token* 
token=get(vertex_token,*(graph->getGraph()),v); if (token==0) { @@ -152,7 +157,9 @@ m_sp() std::string FeatureLemma:: getValue(const LinguisticAnalysisStructure::AnalysisGraph* graph, - LinguisticGraphVertex v) const + LinguisticGraphVertex v, + AnalysisContent & /*unused*/ + ) const { MorphoSyntacticData* data=get(vertex_data,*(graph->getGraph()),v); if (data==0) { @@ -180,7 +187,8 @@ m_propertyManager(0) std::string FeatureProperty:: getValue(const LinguisticAnalysisStructure::AnalysisGraph* graph, - LinguisticGraphVertex v) const + LinguisticGraphVertex v, + AnalysisContent & /*unused*/) const { MorphoSyntacticData* data=get(vertex_data,*(graph->getGraph()),v); if (data==0) { @@ -203,7 +211,8 @@ AbstractFeatureExtractor(language,complement) std::string FeatureTstatus:: getValue(const LinguisticAnalysisStructure::AnalysisGraph* graph, - LinguisticGraphVertex v) const + LinguisticGraphVertex v, + AnalysisContent & /*unused*/) const { Token* token=get(vertex_token,*(graph->getGraph()),v); if (token==0) { @@ -220,70 +229,96 @@ AbstractFeatureExtractor(language,complement) } std::string FeatureSpecificEntity:: -getValue(const LinguisticAnalysisStructure::AnalysisGraph* graph, - LinguisticGraphVertex v) const +getValue(const LinguisticAnalysisStructure::AnalysisGraph* graph, + LinguisticGraphVertex v, + AnalysisContent &analysis + ) const { - std::string typeName(""); - std::map::const_iterator itMSS; - int isPresent; - - std::set< AnnotationGraphVertex > anaVertices = annot->matches("PosGraph",v,"AnalysisGraph"); - if (anaVertices.size()==0) { - return "NAN" ; - } - // note: anaVertices size should be 0 or 1 - for (std::set< AnnotationGraphVertex >::const_iterator anaVerticesIt = anaVertices.begin(); - anaVerticesIt != anaVertices.end(); anaVerticesIt++) + std::string typeName("NAN"); + Common::AnnotationGraphs::AnnotationData *annot = static_cast< Common::AnnotationGraphs::AnnotationData* >(analysis.getData("AnnotationData")); + + std::set< AnnotationGraphVertex > 
matches = annot->matches(graph->getGraphId(),v,"annot"); + for (std::set< AnnotationGraphVertex >::const_iterator it = matches.begin(); it != matches.end(); it++) + { + if (annot->hasAnnotation(*it, Common::Misc::utf8stdstring2limastring("SpecificEntity"))) { - std::set< AnnotationGraphVertex > matches = annot->matches("AnalysisGraph",*anaVerticesIt,"annot"); - for (std::set< AnnotationGraphVertex >::const_iterator it = matches.begin(); - it != matches.end(); it++) - { - AnnotationGraphVertex vx=*it; - if (annot->hasAnnotation(vx, Common::Misc::utf8stdstring2limastring("SpecificEntity"))) - { - const SpecificEntityAnnotation* se = - annot->annotation(vx, Common::Misc::utf8stdstring2limastring("SpecificEntity")). - pointerValue(); - try { - LimaString str= MediaticData::single().getEntityName(se->getType()); - typeName=Common::Misc::limastring2utf8stdstring(str); - } - catch (std::exception& ) { - DUMPERLOGINIT; - LERROR << "Undefined entity type " << se->getType() << LENDL; - LERROR << "failed to output specific entity for vertex " << v << LENDL; - } - - - } else { - // we don't find any entity - return "NAN"; - } - } + AnnotationGraphVertex vx=*it; + const SpecificEntityAnnotation* se = annot->annotation(vx, Common::Misc::utf8stdstring2limastring("SpecificEntity")). 
+ pointerValue(); + + LimaString str= Common::MediaticData::MediaticData::single().getEntityName(se->getType()); + typeName=Common::Misc::limastring2utf8stdstring(str); } - - // Test if the finded type is selected - isPresent=0; // by default, an unfinded entity isn't dumped - itMSS=m_NEauthorized.find(typeName); - if (itMSS!=m_NEauthorized.end()) { - isPresent=atoi(((*itMSS).second).c_str()); - } - if (isPresent) { - return typeName; - } else { - return "NAN"; } + return typeName; } - -void FeatureSpecificEntity::setNEauthorized(std::map mp) { - - m_NEauthorized = mp; +//*********************************************************************** +FeatureLemmaSpecificEntity::FeatureLemmaSpecificEntity(MediaId language, const std::string& complement): +AbstractFeatureExtractor(language,complement) +{ } +std::string FeatureLemmaSpecificEntity:: +getValue(const LinguisticAnalysisStructure::AnalysisGraph* graph, + LinguisticGraphVertex v, + AnalysisContent &analysis +) const +{ + std::string mxvalue("NAN"); + Common::AnnotationGraphs::AnnotationData *annot = static_cast< Common::AnnotationGraphs::AnnotationData* >(analysis.getData("AnnotationData")); + + std::set< AnnotationGraphVertex > matches = annot->matches(graph->getGraphId(),v,"annot"); + for (std::set< AnnotationGraphVertex >::const_iterator it = matches.begin(); it != matches.end(); it++) + { + if (annot->hasAnnotation(*it, Common::Misc::utf8stdstring2limastring("SpecificEntity"))) + { + AnnotationGraphVertex vx=*it; + const SpecificEntityAnnotation* se = annot->annotation(vx, Common::Misc::utf8stdstring2limastring("SpecificEntity")). 
+ pointerValue(); + + LimaString str= Common::MediaticData::MediaticData::single().getEntityName(se->getType()); + mxvalue=Common::Misc::limastring2utf8stdstring(str); + } + } + // replace NAN values by lemmas + if (mxvalue == "NAN") { + MorphoSyntacticData* data=get(vertex_data,*(graph->getGraph()),v); + // take first + for (MorphoSyntacticData::const_iterator it=data->begin(),it_end=data->end();it!=it_end;it++) { + mxvalue = Common::Misc::limastring2utf8stdstring((*&(Common::MediaticData::MediaticData::single().stringsPool(m_language)))[(*it).normalizedForm]); + break; + } + } + // replace empty lemma values by tokens + if (mxvalue == "" ) { + Token* token=get(vertex_token,*(graph->getGraph()),v); + mxvalue = Common::Misc::limastring2utf8stdstring(token->stringForm()); + } + + return mxvalue; +} +//*********************************************************************** +FeatureStoredData::FeatureStoredData(MediaId language, const std::string& complement): +AbstractFeatureExtractor(language,complement) +{ +} +std::string FeatureStoredData:: +getValue(const LinguisticAnalysisStructure::AnalysisGraph* graph, + LinguisticGraphVertex v, + AnalysisContent &analysis) const +{ + Common::AnnotationGraphs::AnnotationData *annot = static_cast< Common::AnnotationGraphs::AnnotationData* >(analysis.getData("AnnotationData")); + Token* token=get(vertex_token,*(graph->getGraph()),v); + if (token==0) { + return ""; + } + ostringstream oss; + oss << token->position() ; + return oss.str(); +} } // end namespace } // end namespace diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/WordFeatureExtractor.h b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/WordFeatureExtractor.h index 433aa2b59..7bfe5a843 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/WordFeatureExtractor.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/WordFeatureExtractor.h @@ -50,7 +50,7 @@ 
class LIMA_ANALYSISDUMPERS_EXPORT AbstractFeatureExtractor virtual std::string getValue(const LinguisticAnalysisStructure::AnalysisGraph* graph, - LinguisticGraphVertex v) const=0; + LinguisticGraphVertex v, AnalysisContent &analysis) const=0; const std::string& getName() { return m_name; } void setName(const std::string& name) { m_name=name; } @@ -112,7 +112,8 @@ class LIMA_ANALYSISDUMPERS_EXPORT FeaturePosition : public AbstractFeatureExtrac std::string getValue(const LinguisticAnalysisStructure::AnalysisGraph* graph, - LinguisticGraphVertex v) const; + LinguisticGraphVertex v, + AnalysisContent & ) const; }; //---------------------------------------------------------------------- @@ -124,7 +125,9 @@ class LIMA_ANALYSISDUMPERS_EXPORT FeatureToken : public AbstractFeatureExtractor std::string getValue(const LinguisticAnalysisStructure::AnalysisGraph* graph, - LinguisticGraphVertex v) const; + LinguisticGraphVertex v, + AnalysisContent & + ) const; }; //---------------------------------------------------------------------- @@ -136,7 +139,9 @@ class LIMA_ANALYSISDUMPERS_EXPORT FeatureLemma : public AbstractFeatureExtractor std::string getValue(const LinguisticAnalysisStructure::AnalysisGraph* graph, - LinguisticGraphVertex v) const; + LinguisticGraphVertex v, + AnalysisContent & + ) const; private: const FsaStringsPool* m_sp; }; @@ -150,7 +155,9 @@ class LIMA_ANALYSISDUMPERS_EXPORT FeatureProperty : public AbstractFeatureExtrac std::string getValue(const LinguisticAnalysisStructure::AnalysisGraph* graph, - LinguisticGraphVertex v) const; + LinguisticGraphVertex v, + AnalysisContent & + ) const; private: std::string m_propertyName; const Common::PropertyCode::PropertyAccessor* m_propertyAccessor; @@ -166,7 +173,9 @@ class LIMA_ANALYSISDUMPERS_EXPORT FeatureTstatus : public AbstractFeatureExtract std::string getValue(const LinguisticAnalysisStructure::AnalysisGraph* graph, - LinguisticGraphVertex v) const; + LinguisticGraphVertex v, + AnalysisContent & + ) const; }; 
//-------------------------------------------------------- @@ -178,15 +187,35 @@ class LIMA_ANALYSISDUMPERS_EXPORT FeatureSpecificEntity : public AbstractFeature ~FeatureSpecificEntity() {} std::string getValue(const LinguisticAnalysisStructure::AnalysisGraph* graph, - LinguisticGraphVertex v) const; - - void setNEauthorized(std::map mp); - - Common::AnnotationGraphs::AnnotationData* annot; - std::map m_NEauthorized; + LinguisticGraphVertex v, + AnalysisContent &) const; +}; +//-------------------------------------------------------- +#define FeatureLemmaSpecificEntity_ID "lemmaSpecificEntity" +class LIMA_ANALYSISDUMPERS_EXPORT FeatureLemmaSpecificEntity : public AbstractFeatureExtractor +{ +public: + FeatureLemmaSpecificEntity(MediaId language, const std::string& complement=""); + ~FeatureLemmaSpecificEntity() {} + + std::string getValue(const LinguisticAnalysisStructure::AnalysisGraph* graph, + LinguisticGraphVertex v, + AnalysisContent &) const; }; +//-------------------------------------------------------- +#define FeatureStoredData_ID "storedData" +class LIMA_ANALYSISDUMPERS_EXPORT FeatureStoredData : public AbstractFeatureExtractor +{ +public: + FeatureStoredData(MediaId language, const std::string& complement=""); + ~FeatureStoredData() {} + + std::string getValue(const LinguisticAnalysisStructure::AnalysisGraph* graph, + LinguisticGraphVertex v, + AnalysisContent &) const; +}; } // end namespace } // end namespace diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/linearTextRepresentationDumper.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/linearTextRepresentationDumper.cpp index 7234afa11..19b882a98 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/linearTextRepresentationDumper.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/linearTextRepresentationDumper.cpp @@ -130,8 +130,8 @@ LimaStatusCode 
LinearTextRepresentationDumper::process( // get sentence boundaries SegmentationData* sb = dynamic_cast(analysis.getData("SentenceBoundaries")); if (sb == 0) { - LERROR << "LinearTextRepresentationDumper::process: no SentenceBounds ! abort"; - return MISSING_DATA; + LDEBUG << "LinearTextRepresentationDumper::process: no SentenceBounds available: ignored"; + // sentence bounds ignored: null pointer passed to LTRTextBuilder will be handled there } // build LTRText LTR_Text textRep; @@ -139,6 +139,7 @@ LimaStatusCode LinearTextRepresentationDumper::process( builder.buildLTRTextFrom( *(anaGraph->getGraph()), sb, + anaGraph->firstVertex(), anaGraph->lastVertex(), &textRep, metadata->getStartOffset()); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/linearTextRepresentationLogger.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/linearTextRepresentationLogger.cpp index b2a68bcce..1f692e1e8 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/linearTextRepresentationLogger.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/linearTextRepresentationLogger.cpp @@ -129,8 +129,8 @@ LimaStatusCode LinearTextRepresentationLogger::process( // get sentence boundaries SegmentationData* sb = dynamic_cast(analysis.getData("SentenceBoundaries")); if (sb == 0) { - LERROR << "no SentenceBounds ! 
abort"; - return MISSING_DATA; + LDEBUG << "LinearTextRepresentationDumper::process: no SentenceBounds available: ignored"; + // sentence bounds ignored: null pointer passed to LTRTextBuilder will be handled there } // build LTRText LTR_Text textRep; @@ -138,6 +138,7 @@ LimaStatusCode LinearTextRepresentationLogger::process( builder.buildLTRTextFrom( *(anaGraph->getGraph()), sb, + anaGraph->firstVertex(), anaGraph->lastVertex(), &textRep, metadata->getStartOffset()); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/EntityFeatures.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/EntityFeatures.cpp index cf3ecd874..035f134e7 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/EntityFeatures.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/EntityFeatures.cpp @@ -75,6 +75,13 @@ bool EntityFeature::operator==(const EntityFeature& f) const return (boost::any_cast(m_value)==boost::any_cast(f.m_value)); } if (type==typeid(LimaString)) { +#ifdef ANTINNO_SPECIFIC +#ifdef DEBUG_LP + SELOGINIT + LDEBUG << "EntityFeature::operator==(EntityFeature& f): f.value: " << boost::any_cast(f.m_value); + LDEBUG << "EntityFeature::operator==(EntityFeature& f): this.value: " << boost::any_cast(m_value); +#endif +#endif return (boost::any_cast(m_value)==boost::any_cast(f.m_value)); } if (type==typeid(double)) { @@ -263,6 +270,18 @@ std::ostream& operator<<(std::ostream& os, const EntityFeatures& f) { return os; } +QDebug& operator<<(QDebug& os, const EntityFeatures& f) { + if (f.empty()) { + return os; + } + EntityFeatures::const_iterator it=f.begin(),it_end=f.end(); + os << (*it).getName() << "=" << (*it).getValueString(); + for (it++; it!=it_end; it++) { + os << "/" << (*it).getName() << "=" << (*it).getValueString(); + } + return os; +} + } // end namespace } // end namespace diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/EntityFeatures.h 
b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/EntityFeatures.h index e9dc3873c..0ab303cf6 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/EntityFeatures.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/EntityFeatures.h @@ -124,19 +124,32 @@ class LIMA_AUTOMATON_EXPORT EntityFeatures: public std::vector { EntityFeatures::iterator findLast(const std::string& featureName); friend LIMA_AUTOMATON_EXPORT std::ostream& operator<<(std::ostream& os, const EntityFeatures& f); + friend LIMA_AUTOMATON_EXPORT QDebug& operator<<(QDebug& os, const EntityFeatures& f); }; +#if defined(WIN32) + extern template LIMA_AUTOMATON_EXPORT void EntityFeatures::appendFeature(const std::string&, const int& ); + extern template LIMA_AUTOMATON_EXPORT void EntityFeatures::appendFeature(const std::string&, const double& ); + extern template LIMA_AUTOMATON_EXPORT void EntityFeatures::appendFeature(const std::string&, const QString& ); +#endif + template void EntityFeatures::setFeature(const std::string& name, const ValueType& value) { - SELOGINIT; - LDEBUG << "EntityFeatures::setFeature(" << name << "," << value << ")"; +// SELOGINIT; +// LDEBUG << "EntityFeatures::setFeature(" << name << "," << value << ")"; // if feature with same name already exists, overwrite it EntityFeatures::iterator it=find(name); if (it!=end()) { // if( (it!=end()) && (name==DEFAULT_ATTRIBUTE) ){ (*it).setValue(boost::any(value)); +#ifdef ANTINNO_SPECIFIC +#ifdef DEBUG_LP + SELOGINIT; + LDEBUG << "EntityFeatures::setFeature(" << name << "," << (*it).getValueString() << ")"; +#endif +#endif } else { //push empy feature and set values to avoid two copies @@ -144,18 +157,30 @@ template push_back(EntityFeature()); back().setName(name); back().setValue(boost::any(value)); +#ifdef ANTINNO_SPECIFIC +#ifdef DEBUG_LP + SELOGINIT; + LDEBUG << "EntityFeatures::setFeature(" << name << "," << back().getValueString() << ")"; +#endif +#endif } } template void 
EntityFeatures::addFeature(const std::string& name, const ValueType& value) { - SELOGINIT; - LDEBUG << "EntityFeatures::addFeature(" << name << "," << value << ")"; +// SELOGINIT; +// LDEBUG << "EntityFeatures::addFeature(" << name << "," << value << ")"; push_back(EntityFeature()); back().setName(name); back().setValue(boost::any(value)); - } +#ifdef ANTINNO_SPECIFIC +#ifdef DEBUG_LP + SELOGINIT; + LDEBUG << "EntityFeatures::addFeature(" << name << "," << back().getValueString() << ")"; +#endif +#endif + } /* template void EntityFeatures::appendFeature(const std::string& name, diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/SpecificEntityAnnotation.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/SpecificEntityAnnotation.cpp index 96d7b6da5..73724e735 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/SpecificEntityAnnotation.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/SpecificEntityAnnotation.cpp @@ -47,9 +47,17 @@ m_normalizedForm(0), m_position(entity.positionBegin()), m_length(entity.length()) { +#ifdef ANTINNO_SPECIFIC + LOGINIT("LP::Automaton"); + LDEBUG << "entity.features(): " << entity.features(); +#endif Automaton::EntityFeatures::const_iterator f=entity.features().find(DEFAULT_ATTRIBUTE); if (f!=entity.features().end()) { + #ifdef ANTINNO_SPECIFIC + LOGINIT("LP::Automaton"); + LDEBUG << "entity.features()[\"value\"]: " << boost::any_cast((*f).getValue()); + #endif m_normalizedForm=sp[boost::any_cast((*f).getValue())]; } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/automaton.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/automaton.cpp index cef11d732..bdebc2581 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/automaton.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/automaton.cpp @@ -33,6 +33,9 @@ #include #include #include 
+#include +#include +#include using namespace std; using namespace Lima::LinguisticProcessing::LinguisticAnalysisStructure; @@ -49,6 +52,9 @@ namespace Automaton { #define DEFAULT_MAXNBRESULTS 50 #define DEFAULT_MAXRESULTSIZE 200 +// a structure to store the position of the search in the automaton +typedef std::pair,const Transition*> DFFSPos; + AutomatonControlParams::AutomatonControlParams(): m_maxDepthStack(DEFAULT_MAXDEPTHSTACK), m_maxTransitionsExplored(DEFAULT_MAXTRANSITIONSEXPLORED), @@ -233,18 +239,26 @@ void Automaton::initializeSearchStructures(MediaId language) { bool Automaton:: getMatchingTransitions(const LinguisticAnalysisStructure::AnalysisGraph& graph, - const LinguisticGraphVertex& vertex, + const LinguisticGraphVertex& vertex, AnalysisContent& analysis, + SearchGraph* searchGraph, const Tstate& state, - std::vector& - matchingTransitions) const { + std::vector& matchingTransitions, + const LinguisticGraphVertex& limit + ) const { Token* token = get(vertex_token, *(graph.getGraph()), vertex); MorphoSyntacticData* data = get(vertex_data, *(graph.getGraph()), vertex); - if (m_searchStructures[state]==0) { -// AULOGINIT; +#ifdef DEBUG_LP + AULOGINIT; + LDEBUG << "Automaton::getMatchingTransitions(vertex: " << vertex << ")"; // LDEBUG << "search structure not initialized: linear search"; +#endif + if (m_searchStructures[state]==0) { //linear search on the transitions +#ifdef DEBUG_LP + LDEBUG << "Automaton::getMatchingTransitions: search structure not initialized: linear search"; +#endif matchingTransitions.clear(); vector::const_iterator trans=m_transitions[state].begin(), @@ -252,19 +266,42 @@ getMatchingTransitions(const LinguisticAnalysisStructure::AnalysisGraph& graph, for (; trans!=trans_end; trans++) { // LDEBUG << "Automaton::getMatchingTransitions vertex: " << vertex; + deque noVertices; +#ifdef ANTINNO_SPECIFIC + DFFSPos newPair(noVertices,nullptr); +#else + DFFSPos newPair(noVertices,0); +#endif + bool 
match=(*trans).transitionUnit()->compare(graph,vertex,analysis,token,data); + const GazeteerTransition* gtrans = dynamic_cast((*trans).transitionUnit()); + // TODO: generalize buildNextTermsList and checkMultiTerms to be able to manage backtrack and backward + if( gtrans != 0 ) { + deque vertices; + match = gtrans->matchPath(graph, vertex, limit, searchGraph, analysis, token, vertices, data); + if( match ) { + newPair = DFFSPos(vertices,&(*trans)); + } + } + else { + deque singleton(1,vertex); + newPair = DFFSPos(singleton,&(*trans)); + } if ((*trans).transitionUnit()->negative()) { match = (!match); } if (match) { - matchingTransitions.push_back(&(*trans)); + matchingTransitions.push_back(newPair); } } return (!matchingTransitions.empty()); } else { +#ifdef DEBUG_LP + LDEBUG << "Automaton::getMatchingTransitions: search structure initialized find"; +#endif return m_searchStructures[state]-> - findMatchingTransitions(graph,vertex,analysis,token,data,matchingTransitions); + findMatchingTransitions2(graph,vertex,limit,searchGraph,analysis,token,data,matchingTransitions); } } @@ -311,6 +348,7 @@ operator()(const AutomatonMatch& r1, // internal definition of a utility class: // stack for DFS test function + class Automaton::DFSStack { public: DFSStack(const Automaton& a, @@ -333,24 +371,25 @@ class Automaton::DFSStack { bool isEndVertex(const LinguisticGraphVertex& v) const { return (v==m_searchGraph->endOfGraph(m_graph)); } - std::pair top(); - void popVertex(); + // std::pair top(); + DFFSPos top(); + /* TODO: useful?
+ * void popVertex(); + */ bool pop(); bool push(const LinguisticGraphVertex& vertex, const Tstate& state, - AnalysisContent& analysis); + AnalysisContent& analysis, + const LinguisticGraphVertex& limit); private: struct DFSStackElement { - DFSStackElement(LinguisticGraphVertex v, - const std::vector& t): - m_vertex(v), - m_transitions(t), - m_transition(t.begin()) + DFSStackElement( std::vector& matchingTransitions): + m_transitions(matchingTransitions), + m_transition(matchingTransitions.begin()) { } DFSStackElement(const DFSStackElement& elt): - m_vertex(elt.m_vertex), m_transitions(elt.m_transitions), m_transition(m_transitions.begin()) { @@ -358,9 +397,10 @@ class Automaton::DFSStack { ~DFSStackElement() {} - LinguisticGraphVertex m_vertex; - std::vector m_transitions; - std::vector::const_iterator m_transition; + std::vector m_transitions; + //std::vector > m_transitions; + std::vector::const_iterator m_transition; + //std::vector >::const_iterator m_transition; }; std::vector m_stack; const Automaton& m_automaton; @@ -369,16 +409,15 @@ class Automaton::DFSStack { LinguisticGraphVertex m_limit; }; -std::pair -Automaton::DFSStack::top() { +//std::pair +DFFSPos Automaton::DFSStack::top() { // AULOGINIT; // LDEBUG << "Automaton:DFSSTack: top " // << "transition=" << *(m_stack.back().m_transition) // << ";transitionUnit=" // << (*(m_stack.back().m_transition))->transitionUnit() // ; - return make_pair(m_stack.back().m_vertex, - *(m_stack.back().m_transition)); + return *(m_stack.back().m_transition); } bool Automaton::DFSStack::pop() { @@ -395,14 +434,68 @@ bool Automaton::DFSStack::pop() { return false; } -void Automaton::DFSStack::popVertex() { +/* TODO usefull? 
+ * void Automaton::DFSStack::popVertex() { m_stack.pop_back(); } - +*/ +/* + * fill the stack with pairs (nextV,matchingTransition) + * nextV is one of the successor nodes in the graph + * The function look for possible transition from state + * and select matchingTransition = set of transition which succeed with nextV + */ +/* + * Pour remplir la pile, on itére sur les outVertex, + * puis pour chaque vertex, on regarde quelles transitions obtiennent un succès + * Cela ressemble à l'initialisation d'un mode largeur d'abord... + * En fait, c'est simplement pour limiter la taille de la structure de données qui gère le contexte de parcours. + * Le parcours se fait en profondeur d'abord (DFS Deep First Search) + * conforme au nom de la pile DFSStack. + * + * Le parcours se fait en profondeur d'abord sur le graphe d'analyse, limité sur plusieurs aspects: + * - les limites du graphe (begin, end), c'est à dire les noeuds 0 et 1 qui terminent le treillis. + * (si le parcours se fait en avant, limit = end, si le parcours se fait en arière, limit = begin) + * - la profondeur de la pile (pour éviter des traitements trop longs et des dépassements de pile sur + * des textes 'pathologiques', ex: des texts issus de tableaux) + * - le nombre de backtrack??? + * L'unité d'avancement dans ce parcours est le passage d'un noeud à l'un des noeuds successeurs + * dans le graphe d'analyse. De même dans les opérations de backtrack, on revient sur une étape de + * ce parcours. + * Si on souhaite intégrer les transitions de type GazetteerTransition, il faut pouvoir + * gérer une unité d'avancement différente: il faut envisager l'avancement sur plusieurs noeuds + * successifs du graphe lorsqu'il y a un match d'un élément multi-terme du gazetteer. De même le + * backtrack doit se faire jusqu'au point d'avancement précédent donc revenir en arrière sur + * plusieurs noeuds. + * Une pile sert à gérer le point d'avancement dans le parcours. 
+ * Actuellement, pour remplir la pile, on itére sur les 'out vertex' puis pour chaque vertex, on regarde + * quelles transitions obtiennent un succès. Cela ne convient plus car on ne couvre pas le cas des noeuds + * atteints par les éléments multi-termes des gazeteer. + * En effet, pour une paire (out vertex, transition) qui décrit une possibilité d'avancement, l'exécution de + * la transition va nous faire avancer au delà du noeud 'out vertex' dans le cas des multi-terme. + * Toutes les transitions ne font pas atteindre le même noeud. + * On est donc obligé de modifier la structure de données de la pile qui gére le contexte de parcours et le + * backtrack. + * Changement: + + * On modifie seulement Automaton::getMatchingTransitions et la structure Automaton::DFSStack. + * On considère que nextVertex est la direction dans laquelle on va, mais la transition peut mener plus loin. + * On modifie DFSStackElement de la façon suivante: + * DFSStackElement contenait un noeud (out vertex) et une collection (vector) de transitions possibles + * DFSStackElement contient maintenant une collection (vector) de paires (séquence de noeud parcourus pendant la transition, transition possible) + * (stack, transition), ainsi qu'un itérateur sur cette liste. + * stack est le chemin dans le graphe (commençant par nextVertex) correspondant à l'exécution de la transition. + * + * Attention aux paramètres begin,end de la fonction checkMultiTerms + * La fonction checkMultiTerms a été écrite pour avec les limitations suivantes: sens forward seulement, pas de + * prise en compte de multiples arêtes à partir d'un noeud. 
+ * + */ bool Automaton::DFSStack:: push(const LinguisticGraphVertex& vertex, const Tstate& state, - AnalysisContent& analysis) { + AnalysisContent& analysis, + const LinguisticGraphVertex& limit) { /* AULOGINIT; LDEBUG << "Automaton:DFSSTack: pushing " << vertex @@ -425,12 +518,13 @@ push(const LinguisticGraphVertex& vertex, LinguisticGraphVertex nextVertex; while (m_searchGraph->getNextVertex(m_graph.getGraph(),nextVertex)) { if (! isEndVertex(nextVertex)) { - std::vector matchingTransitions(0); + std::vector matchingTransitions(0); // LDEBUG << "Automaton:get matching transitions from state " // << state << " for vertex " << nextVertex; if (m_automaton. getMatchingTransitions(m_graph,nextVertex,analysis, - state,matchingTransitions)) { + m_searchGraph,state,matchingTransitions,limit)) { + /* if (logger.isDebugEnabled()) { ostringstream oss; std::vector::const_iterator @@ -442,7 +536,7 @@ push(const LinguisticGraphVertex& vertex, } LDEBUG << oss.str(); }*/ - tmpStack.push_back(DFSStackElement(nextVertex,matchingTransitions)); + tmpStack.push_back(DFSStackElement(matchingTransitions)); } /* else { LDEBUG << "Automaton:DFSSTack: => no matching transitions" @@ -526,7 +620,7 @@ getAllMatches(const LinguisticAnalysisStructure::AnalysisGraph& graph, &forward, limit); success = testFromState(initialState, graph, - begin, analysis, + begin, limit, analysis, results, checkList, forwardSearchStack, @@ -541,7 +635,7 @@ getAllMatches(const LinguisticAnalysisStructure::AnalysisGraph& graph, &backward, limit); success = testFromState(initialState, graph, - begin, analysis, + begin, limit, analysis, results, checkList, backwardSearchStack, @@ -557,6 +651,7 @@ getAllMatches(const LinguisticAnalysisStructure::AnalysisGraph& graph, bool Automaton::testFromState(const Tstate firstState, const LinguisticAnalysisStructure::AnalysisGraph& graph, const LinguisticGraphVertex& beginVertex, + const LinguisticGraphVertex& limitVertex, AnalysisContent& analysis, AutomatonMatchSet& results, 
ConstraintCheckList& checkList, @@ -569,7 +664,7 @@ bool Automaton::testFromState(const Tstate firstState, // store in stack pairs of (automaton transition/graph vertex) // (store combinatory of all possible pairs, but if store only // matching pairs, problems with ConstraintCheckList - + RecognizerMatch currentMatch(&graph); // check initial state @@ -582,16 +677,18 @@ bool Automaton::testFromState(const Tstate firstState, return (!results.empty()); } - // begin is the vertex that matched the trigger: - // push following vertices + // beginVertex is the vertex that matched the trigger + // initialize the stack with pairs (stack of vertex with nextV as first element,matchingTransition) + // nextV is one of the successor nodes in the graph and matchingTransition(nextV) succeeds // LDEBUG << "pushing"; - S.push(beginVertex,firstState,analysis); + S.push(beginVertex,firstState,analysis,limitVertex); LinguisticGraphVertex vertex; const Transition* transition(0); uint64_t nbIter(0); bool backtrack(false); + // contexte de backtrack vector backtrackDepth; backtrackDepth.push_back(0); @@ -602,18 +699,19 @@ bool Automaton::testFromState(const Tstate firstState, // LDEBUG << "in iteration " << nbIter; if (S.size() > controlParams.getMaxDepthStack()) { AULOGINIT; - LWARN << "MaxDepthStack exceeded in automaton search: ignore rest of search" - ; + LWARN << "MaxDepthStack exceeded in automaton search: ignore rest of search"; return (!results.empty()); } if (nbIter > controlParams.getMaxTransitionsExplored()) { AULOGINIT; - LWARN << "MaxTransitionsExplored exceeded in automaton search: ignore rest of search" - ; + LWARN << "MaxTransitionsExplored exceeded in automaton search: ignore rest of search"; return (!results.empty()); } - boost::tie(vertex,transition)=S.top(); + // boost::tie(vertex,transition)=S.top(); + DFFSPos const & dffsPos = S.top(); + vertex = dffsPos.first.front(); + transition = dffsPos.second; if (backtrack) { // in backtrack : pop_back current match until the 
vertex // for which we are testing a new matching transition @@ -658,12 +756,17 @@ bool Automaton::testFromState(const Tstate firstState, // } //if (trans->match(graph,vertex,analysis,checkList)) { + // TODO: call checkConstraints for every vertex in the deque? if (trans->checkConstraints(graph,vertex,analysis,checkList)) { // LDEBUG << "Automaton: -> match found"; // update current match LimaString transId = LimaString::fromUtf8( trans->getId().c_str() ); - currentMatch.addBackVertex(vertex,trans->keep(), transId); + // OME: call for the complete stack currentMatch.addBackVertex(vertex,trans->keep(), transId); + std::deque::const_iterator vIt = dffsPos.first.begin(); + for( ; vIt != dffsPos.first.end() ; vIt++ ) { + currentMatch.addBackVertex(*vIt,trans->keep(), transId); + } /* LDEBUG << "Automaton: -> vertex (" << vertex << ",keep=" << trans->keep() << ") added in result, currentMatch=" @@ -717,7 +820,7 @@ bool Automaton::testFromState(const Tstate firstState, } // push next vertices - if (!S.push(vertex,nextState,analysis)) { + if (!S.push(vertex,nextState,analysis,limitVertex)) { backtrack=true; } } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/automaton.h b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/automaton.h index fc24f82ff..a6e12e728 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/automaton.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/automaton.h @@ -35,6 +35,9 @@ #include "AutomatonExport.h" #include "transitionUnit.h" #include "searchGraph.h" +#ifdef ANTINNO_SPECIFIC +#include "gazeteerTransition.h" +#endif #include "transition.h" #include "transitionSearchStructure.h" #include "recognizerMatch.h" @@ -341,9 +344,10 @@ friend class AutomatonWriter; bool getMatchingTransitions(const LinguisticAnalysisStructure::AnalysisGraph& graph, const LinguisticGraphVertex& vertex, AnalysisContent& analysis, + SearchGraph* searchGraph, const Tstate& 
state, - std::vector& - matchingTransitions) const; + std::vector,const Transition*> >& matchingTransitions, + const LinguisticGraphVertex& limit) const; protected: Tstate m_numberStates; /**< number of states in the automaton */ @@ -366,6 +370,7 @@ friend class AutomatonWriter; bool testFromState(const Tstate firstState, const LinguisticAnalysisStructure::AnalysisGraph& graph, const LinguisticGraphVertex& begin, + const LinguisticGraphVertex& limit, AnalysisContent& analysis, AutomatonMatchSet& results, ConstraintCheckList& checkList, diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/automatonCommon.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/automatonCommon.cpp index 9d1fb9a95..7e70908c5 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/automatonCommon.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/automatonCommon.cpp @@ -59,6 +59,37 @@ void writeTword(std::ofstream& file,const Tword& s,const FsaStringsPool& sp) Misc::writeUTF8StringField(file,sp[s]); } +// LimaString type +void readLimaString(std::ifstream& file, LimaString& s) +{ + Misc::readUTF8StringField(file,s); +} +void writeLimaString(std::ofstream& file,const LimaString& s) +{ + Misc::writeUTF8StringField(file,s); +} + +// wordSet = set of multi-term +void readWordVector(std::ifstream& file, std::vector& wordVector) +{ + int i = Misc::readCodedInt(file); + for( ; i > 0 ; i-- ) { + LimaString s; + Misc::readUTF8StringField(file,s); + wordVector.push_back(s); + } +} + +void writeWordSet(std::ofstream& file,const std::set& wordSet) +{ + int i = wordSet.size(); + Misc::writeCodedInt(file,i); + std::set::const_iterator wordIt = wordSet.begin(); + for( ; wordIt != wordSet.end() ; wordIt++ ) { + Misc::writeUTF8StringField(file,*wordIt); + } +} + //---------------------------------------------------------------------- // Part-of-speech type void readTpos(std::ifstream& file, Tpos& p) diff --git 
a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/automatonCommon.h b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/automatonCommon.h index d1d20f438..7abf6f758 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/automatonCommon.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/automatonCommon.h @@ -42,6 +42,8 @@ #include #include #include +#include +#include namespace Lima { namespace LinguisticProcessing { @@ -70,6 +72,10 @@ void writeTword(std::ofstream& file,const Tword& s,const FsaStringsPool& sp); void readTpos(std::ifstream&, Tpos&); void writeTpos(std::ofstream&,const Tpos&); +// reading and writing set of words (for gazeteer) +void readWordVector(std::ifstream& file, std::vector& wordVector); +void writeWordSet(std::ofstream& file,const std::set& wordSet); + //comparing the part-of-speech type with a LingPropertyEntry // should take a const LingPropertyEntry& argument // check with JYS diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/automatonReaderWriter.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/automatonReaderWriter.cpp index 3ffc837c9..7c0fb4e48 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/automatonReaderWriter.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/automatonReaderWriter.cpp @@ -46,6 +46,7 @@ #include "setTransition.h" #include "deaccentuatedTransition.h" #include "entityTransition.h" +#include "entityGroupTransition.h" #include "common/Data/readwritetools.h" #include "linguisticProcessing/core/LinguisticAnalysisStructure/TStatus.h" @@ -71,7 +72,12 @@ void writeTypeTransition(std::ofstream& file, const TypeTransition t) { } +#ifdef ANTINNO_SPECIFIC #define RECOGNIZER_VERSION "1.20" +#else +#define RECOGNIZER_VERSION "1.30" +#endif + #define RECOGNIZER_DEBUG_VERSION ".debug" 
//---------------------------------------------------------------------- @@ -396,6 +402,18 @@ readTransitionUnit(std::ifstream& file,MediaId language) t=new TStatusTransition(status); break; } + case T_GAZETEER: { + // read alias + LimaString alias; + Misc::readUTF8StringField(file,alias); + // read set of words + std::vector wordVector; + readWordVector(file,wordVector); + // read keep + int keepVal = Misc::readCodedInt(file); + // create transition + t=new GazeteerTransition(wordVector, alias, keepVal == 1); + break; } case T_AND: { uint64_t size=Misc::readCodedInt(file); vector tmp(size); @@ -441,6 +459,12 @@ readTransitionUnit(std::ifstream& file,MediaId language) t=new EntityTransition(m_entityTypeMapping[EntityType(typeId,groupId)]); break; } + case T_ENTITY_GROUP: { + EntityGroupId groupId=static_cast(Misc::readCodedInt(file)); + // use entityGroup mapping + t=new EntityGroupTransition(m_entityGroupMapping[groupId]); + break; + } default: { AULOGINIT; LERROR << "Undefined type of transition: " << codeTrans; @@ -460,6 +484,7 @@ readTransitionUnit(std::ifstream& file,MediaId language) char *buf = new char [len]; file.read(buf, len); t->setId(std::string(buf,len)); + delete[] buf; uint64_t n=Misc::readCodedInt(file); Constraint c; for (uint64_t i(0); ipartOfSpeech()); break; } + case T_GAZETEER: { + GazeteerTransition* t=static_cast(transition); + Misc::writeUTF8StringField(file,t->alias()); + writeWordSet(file,t->wordSet()); + if( t->keep() ) + Misc::writeCodedInt(file,1); + else + Misc::writeCodedInt(file,0); + break; + } case T_NUM: { NumericTransition* t=static_cast(transition); Misc::writeCodedInt(file,t->value()); @@ -725,6 +760,12 @@ writeTransitionUnit(std::ofstream& file, file.write((char*) &lang,sizeof(unsigned char)); break; } + case T_ENTITY_GROUP: { + EntityGroupTransition* t=static_cast(transition); + EntityGroupId entityGroupId=t->entityGroupId(); + Misc::writeCodedInt(file,entityGroupId); + break; + } case T_ENTITY: { EntityTransition* 
t=static_cast(transition); EntityType entityType=t->entityType(); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/entityGroupTransition.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/entityGroupTransition.cpp new file mode 100644 index 000000000..46a36faa3 --- /dev/null +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/entityGroupTransition.cpp @@ -0,0 +1,130 @@ +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. 
If not, see +*/ +/************************************************************************* + * + * @file entityGroupTransition.cpp + * @author Olivier Mesnard (olivier.mesnard@cea.fr) + * @date Mon oct 5 2015 + * copyright (c) 2006-2015 by CEA + * + *************************************************************************/ + + +#include "entityGroupTransition.h" +#include "linguisticProcessing/LinguisticProcessingCommon.h" +#include "linguisticProcessing/common/annotationGraph/AnnotationData.h" +#include "linguisticProcessing/core/Automaton/SpecificEntityAnnotation.h" + +using namespace std; +using namespace Lima::LinguisticProcessing::LinguisticAnalysisStructure; +using namespace Lima::Common::MediaticData; +using namespace Lima::Common::AnnotationGraphs; +using namespace Lima::LinguisticProcessing::SpecificEntities; + +namespace Lima { +namespace LinguisticProcessing { +namespace Automaton { + +/***********************************************************************/ +// initialization of static members +LimaString EntityGroupTransition::m_entityAnnotation=Common::Misc::utf8stdstring2limastring("SpecificEntity"); + +/***********************************************************************/ +// constructors +/***********************************************************************/ +EntityGroupTransition::EntityGroupTransition(): +TransitionUnit(), +m_entityGroupId() +{ +} + +EntityGroupTransition::EntityGroupTransition(Common::MediaticData::EntityGroupId groupId, bool keep): +TransitionUnit(keep), +m_entityGroupId(groupId) +{ +} + +EntityGroupTransition::~EntityGroupTransition() {} + +std::string EntityGroupTransition::printValue() const { + ostringstream oss; + oss << "ENTITY_GROUP_" << m_entityGroupId; + return oss.str(); +} + +/***********************************************************************/ +// operators == +/***********************************************************************/ +bool EntityGroupTransition::operator== (const TransitionUnit& tright) 
const { + if ( (type() == tright.type()) + && (m_entityGroupId == static_cast(tright).entityGroupId()) + ) { + return true; + } + else { + return false; + } +} + +bool EntityGroupTransition:: +compare(const LinguisticAnalysisStructure::AnalysisGraph& graph, + const LinguisticGraphVertex& v, + AnalysisContent& analysis, + const LinguisticAnalysisStructure::Token* /*token*/, + const LinguisticAnalysisStructure::MorphoSyntacticData* /*data*/) const +{ + // should compare to vertex ? + AnnotationData* annotationData = static_cast< AnnotationData* >(analysis.getData("AnnotationData")); + if (annotationData==0) { + AULOGINIT; + LDEBUG << "EntityGroupTransition::compare: no annotation graph available !"; + return false; + } + + // find annotationGraphVertex matching the vertex of the current graph + std::set matches = annotationData->matches(graph.getGraphId(), v, "annot"); + if (matches.empty()) + { + AULOGINIT; + LDEBUG << "annotation ("<hasAnnotation(annotVertex, m_entityAnnotation)) + { + AULOGINIT; + LDEBUG << "EntityGroupTransition::compare: No " << m_entityAnnotation << " annotation available on " << v; + return false; + } + + const SpecificEntityAnnotation* se = + annotationData->annotation(annotVertex, m_entityAnnotation). 
+ pointerValue(); + Common::MediaticData::EntityType type = se->getType(); + AULOGINIT; + LDEBUG << "EntityGroupTransition::compare: type = " << type << ", groupId = " << type.getGroupId(); + LDEBUG << "EntityGroupTransition::compare: m_entityGroupId = " << m_entityGroupId; + LDEBUG << "EntityGroupTransition::compare: tests m_entityGroupId == type.getGroupId() = " << (m_entityGroupId == type.getGroupId()); + return( m_entityGroupId == type.getGroupId() ); +} + +} // namespace end +} // namespace end +} // namespace end diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/entityGroupTransition.h b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/entityGroupTransition.h new file mode 100644 index 000000000..170105cc4 --- /dev/null +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/entityGroupTransition.h @@ -0,0 +1,86 @@ +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. 
If not, see +*/ +/************************************************************************ + * + * @file EntityGroupTransition.h + * @author Olivier Mesnard (olivier.mesnard@cea.fr) + * @date Mon oct 5 2015 + * copyright (c) 2006-2015 by CEA + * Project Automaton + * + * @brief transitions that are previously recognized entities + * + ***********************************************************************/ + +#ifndef ENTITYGROUPTRANSITION_H +#define ENTITYGROUPTRANSITION_H + +#include "AutomatonExport.h" +#include "automatonCommon.h" +#include "transitionUnit.h" + +namespace Lima { +namespace LinguisticProcessing { +namespace Automaton { + +class LIMA_AUTOMATON_EXPORT EntityGroupTransition : public TransitionUnit +{ + public: + EntityGroupTransition(); + EntityGroupTransition(Common::MediaticData::EntityGroupId, bool keep=true); + virtual ~EntityGroupTransition(); + + EntityGroupTransition* clone() const; + EntityGroupTransition* create() const; + + std::string printValue() const; + bool operator== (const TransitionUnit&) const; + + bool compare(const LinguisticAnalysisStructure::AnalysisGraph& graph, + const LinguisticGraphVertex& vertex, + AnalysisContent& analysis, + const LinguisticAnalysisStructure::Token* token, + const LinguisticAnalysisStructure::MorphoSyntacticData* data) const; + + TypeTransition type() const; + Common::MediaticData::EntityGroupId entityGroupId() const { return m_entityGroupId; } + void setEntityGroupId(Common::MediaticData::EntityGroupId groupId) { m_entityGroupId=groupId; } + + private: + Common::MediaticData::EntityGroupId m_entityGroupId; + static LimaString m_entityAnnotation; +}; + + +/***********************************************************************/ +// inline access functions +/***********************************************************************/ +inline TypeTransition EntityGroupTransition::type() const { return T_ENTITY_GROUP; } + +inline EntityGroupTransition* EntityGroupTransition::clone() const { + return new 
EntityGroupTransition(*this); } +inline EntityGroupTransition* EntityGroupTransition::create() const { + return new EntityGroupTransition(); } + + +} // namespace end +} // namespace end +} // namespace end + +#endif diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/gazeteerTransition.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/gazeteerTransition.cpp new file mode 100644 index 000000000..1c777c887 --- /dev/null +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/gazeteerTransition.cpp @@ -0,0 +1,376 @@ +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. 
If not, see +*/ +/************************************************************************* +* +* File : gazeteerTransition.cpp +* Author : Olivier Mesnard (olivier.mesnard@cea.fr) +* @date Thu August 04 2015 +* copyright Copyright (C) 2002-2015 by CEA LIST +* Version : $Id$ +* +*************************************************************************/ + + +#include "gazeteerTransition.h" +#include "common/MediaticData/mediaticData.h" +#include +#include // for tie +#include "searchGraph.h" + +using namespace std; +using namespace Lima::LinguisticProcessing::LinguisticAnalysisStructure; +using namespace Lima::Common::MediaticData; + + +namespace Lima { +namespace LinguisticProcessing { +namespace Automaton { + +/***********************************************************************/ +// constructors +/***********************************************************************/ +GazeteerTransition::GazeteerTransition(): +TransitionUnit(), +m_wordSet(), +m_alias() +{ +} + +GazeteerTransition::GazeteerTransition(const std::vector& wordSet, const LimaString& alias, bool keep): +TransitionUnit(keep), +m_wordSet(wordSet.begin(),wordSet.end()), +m_alias(alias) +{ +} + +GazeteerTransition::GazeteerTransition(const GazeteerTransition& t): +TransitionUnit(t), +m_wordSet(t.m_wordSet), +m_alias(t.m_alias) +{ +// TODO ToBeDeleted ? 
+ // copyProperties(t); +} + +GazeteerTransition::~GazeteerTransition() {} + +GazeteerTransition& GazeteerTransition::operator = (const GazeteerTransition& t) { + if (this != &t) { + m_alias = t.alias(); + copyProperties(t); + } + return *this; +} + + +std::string GazeteerTransition::printValue() const { + ostringstream oss; + oss << "alias:" << Lima::Common::Misc::limastring2utf8stdstring(m_alias); + std::set::const_iterator it = m_wordSet.begin(); + if( it != m_wordSet.end() ) { + const Lima::LimaString & word = *it; + oss << "(" << Lima::Common::Misc::limastring2utf8stdstring(word); + } + for( it++ ; it != m_wordSet.end(); it++ ) { + const Lima::LimaString & word = *it; + oss << "," << Lima::Common::Misc::limastring2utf8stdstring(word); + } + oss << ")"; + return oss.str(); +} + +/***********************************************************************/ +// operators == +/***********************************************************************/ +bool GazeteerTransition::operator== (const TransitionUnit& tright) const { + if ( (type() == tright.type()) + && (m_alias == static_cast(tright).alias()) + ) { + return compareProperties(tright); + } + else { + return false; + } +} + +bool GazeteerTransition:: +compare(const LinguisticAnalysisStructure::AnalysisGraph& /*graph*/, + const LinguisticGraphVertex& /*vertex*/, + AnalysisContent& /*analysis*/, + const LinguisticAnalysisStructure::Token* token, + const LinguisticAnalysisStructure::MorphoSyntacticData* /*data*/) const +{ + //AULOGINIT; +// LDEBUG << "GazeteerTransition compare " << Common::MediaticData::MediaticData::changeable().stringsPool()[token->form()] << " and " << Common::MediaticData::MediaticData::changeable().stringsPool()[m_word]; + QString form(token->stringForm()); + std::set::const_iterator it = m_wordSet.lower_bound(form); + if( it == m_wordSet.end() ) { + return false; + } + QString element = *it; + // If element is equal to form + if( element == form ) + { + return true; + } + // Or element begin 
with form followed by a space character + if( element.startsWith(form) ) + { + if( element.at(form.length()) == ' ') + { + return true; + } + } + /* + QString pattern(form); + pattern.append("\\b"); + QRegExp rx(pattern); + int index = qStringList.indexOf(rx); + */ +// return true; + return false; +} + +bool GazeteerTransition:: +matchPath(const LinguisticAnalysisStructure::AnalysisGraph& graph, + const LinguisticGraphVertex& vertex, + const LinguisticGraphVertex& limit, + SearchGraph* searchGraph, + AnalysisContent& analysis, + const LinguisticAnalysisStructure::Token* token, + deque& vertices, + const LinguisticAnalysisStructure::MorphoSyntacticData* /*data*/) const +{ + // TODO: use of limit??? +#ifdef DEBUG_LP + AULOGINIT; +#endif + const LimaString firstSimpleTerm = token->stringForm(); + /* build multi term list in gazeteer with firstSimpleTerm as first term */ + std::vector > additionalMultiTermList; + buildNextTermsList( firstSimpleTerm, additionalMultiTermList ); + /* follow graph if tokens match other terms */ + std::stack,std::vector > > triggerMatches; + checkMultiTerms(graph, vertex, limit, searchGraph, analysis, additionalMultiTermList, triggerMatches ); + if( triggerMatches.empty() ) { +#ifdef DEBUG_LP + LDEBUG << "GazeteerTransition::match: trans of type gazeteerTransition selected but no match"; +#endif + return false; + } + else { + vertices = triggerMatches.top(); + return true; + } + return false; +} + + /* Gazeteer may contains multi-term elements like */ +/* "managing director","Managing Director","managing editor","managing comitee secretary"... 
*/ +/* From wordSet, we build a list of multiple terms, each with parameter firstSimpleTerm as first simple term */ +/* [("managing,director");("managing,Director");("managing,editor");("managing,comitee,secretary")] */ +/* return false if there is no elements begining with "managing" */ +bool GazeteerTransition:: +buildNextTermsList( const LimaString& firstSimpleTerm, std::vector >& multiTermList ) const +{ +#ifdef DEBUG_LP + AULOGINIT; + LDEBUG << "GazeteerTransition::buildNextTermsList(" << firstSimpleTerm << ")"; +#endif + + // Fill list of list of additional simple terms from list of elements + std::set::const_iterator it = m_wordSet.lower_bound(firstSimpleTerm); + if( it == m_wordSet.end() ) { +#ifdef DEBUG_LP + LDEBUG << "GazeteerTransition::buildNextTermsList: Error: first term not found"; +#endif + return false; + } + for( ; it != m_wordSet.end() ; it++ ) + { + LimaString element = *it; +#ifdef DEBUG_LP + LDEBUG << "GazeteerTransition::buildNextTermsList: Examining " << element.toStdString(); +#endif + // if element does not start with firstSimpleTerm, there no more possible match + if( !element.startsWith(firstSimpleTerm) ) { +#ifdef DEBUG_LP + LDEBUG << "GazeteerTransition::buildNextTermsList: stop it!: first term not found"; +#endif + break; + } + std::vector multiTerm; + // if element equals the token, we push a vector with a unique element, and go to the next element + if( element == firstSimpleTerm ) { +#ifdef DEBUG_LP + LDEBUG << "GazeteerTransition::buildNextTermsList: push back in multiTermList singleton " << firstSimpleTerm.toStdString(); +#endif + multiTerm.push_back(firstSimpleTerm); + multiTermList.push_back(multiTerm); + continue; + } + // within element, if firstSimpleTerm is not followed by others simple terms separated with space + // first term is only a prefix and does not match exactly firstSimpleTerm, go to the next element + int pos(0); + int index = element.indexOf(' ', pos); +#ifdef DEBUG_LP + LDEBUG << 
"GazeteerTransition::buildNextTermsList: pos = " << pos << ", index=" << index; +#endif + if( index != firstSimpleTerm.length() ) { +#ifdef DEBUG_LP + LDEBUG << "GazeteerTransition::buildNextTermsList: no second term for " << element.toStdString(); +#endif + continue; + } + else { +#ifdef DEBUG_LP + LDEBUG << "GazeteerTransition::buildNextTermsList: push back in multiterm " << firstSimpleTerm.toStdString(); +#endif + multiTerm.push_back(firstSimpleTerm); + } + // build list of elements following firstSimpleTerm + for( ; ; ) { + pos = index+1; + index = element.indexOf(' ', pos); +#ifdef DEBUG_LP + LDEBUG << "GazeteerTransition::buildNextTermsList: pos = " << pos << ", index=" << index; +#endif + if( index == -1 ) { +#ifdef DEBUG_LP + LDEBUG << "GazeteerTransition::buildNextTermsList: push back last term " << element.mid(pos).toStdString(); +#endif + multiTerm.push_back(element.mid(pos)); + break; + } + else + { +#ifdef DEBUG_LP + LDEBUG << "GazeteerTransition::buildNextTermsList: add term " << element.mid(pos,index-pos).toStdString(); +#endif + multiTerm.push_back(element.mid(pos,index-pos)); + } + } +#ifdef DEBUG_LP + LDEBUG << "GazeteerTransition::buildNextTermsList: push back list of " << multiTerm.size() << " elements"; +#endif + multiTermList.push_back(multiTerm); + } + return( multiTermList.size() > 0 ); +} + +bool GazeteerTransition:: +checkMultiTerms( const AnalysisGraph& graph, + const LinguisticGraphVertex& position, + const LinguisticGraphVertex& limit, + Lima::LinguisticProcessing::Automaton::SearchGraph* searchGraph, + Lima::AnalysisContent& analysis, const vector< vector< Lima::LimaString > >& additionalMultiTermList, + stack< deque< LinguisticGraphVertex >, vector< deque< LinguisticGraphVertex > > >& matches + ) const { + + +#ifdef DEBUG_LP + AULOGINIT; + LDEBUG << "GazeteerTransition::checkMultiTerms( from " << position << ")"; +#endif + // Iteration on multi-terms from gazeteer whose first term matches current token + std::vector >::const_iterator 
multiTermsIt = additionalMultiTermList.begin(); + const LinguisticGraph* lGraph = graph.getGraph(); + for( ; multiTermsIt != additionalMultiTermList.end() ; multiTermsIt++ ) { + // iterator for simpleterms + std::vector::const_iterator termsIt = (*multiTermsIt).begin(); + std::vector::const_iterator termsIt_end = (*multiTermsIt).end(); +#ifdef DEBUG_LP + LDEBUG << "GazeteerTransition::checkMultiTerms: check multi-term (" + << *termsIt << " and " << (*multiTermsIt).size()-1 << " more...)"; +#endif + // For each list of simple Terms, we make a deep first search in the graph + // searchPos stores a stack of position in the graph to perform the deep first search + // the completed path is stored in a deque of vertices (initialized with position) + std::deque triggerMatch; + triggerMatch.push_back(position); + termsIt++; + // init search from position + searchGraph->findNextVertices(lGraph, position); + // init current position + LinguisticGraphVertex nextVertex = position; + // if list is not exhausted + + // case of empty list of simple term + if(termsIt == termsIt_end ) { + // Error! 
+#ifdef DEBUG_LP + LDEBUG << "GazeteerTransition::checkMultiTerms: list of simple terms is a singleton!"; +#endif + matches.push(triggerMatch); + //matches.push(triggerMatch); + } + else { + // go one step ahead from curentPosition if possible + while ( searchGraph->getNextVertex(lGraph, nextVertex )) { + const LinguisticGraphVertex& firstVertex = graph.firstVertex(), + lastVertex = graph.lastVertex(); + if (nextVertex == lastVertex || nextVertex == firstVertex) +// return false; + break; +#ifdef DEBUG_LP + LDEBUG << "GazeteerTransition::checkMultiTerms: progress one step forward, nextVertex=" << nextVertex; + LDEBUG << "GazeteerTransition::checkMultiTerms: test " << *termsIt; +#endif + // test currentVertex + Token* token = get(vertex_token, *lGraph, nextVertex); + LimaString form(token->stringForm()); + if( form == *termsIt ) { +#ifdef DEBUG_LP + LDEBUG << "GazeteerTransition::checkMultiTerms: match with " << *termsIt; +#endif + // If match, push vertex in triggerMatch and initialize next step + // Push out_edge is a better if we have to follow the path from the begining ??? + triggerMatch.push_back(nextVertex); + // stack next step to continue the search + searchGraph->findNextVertices(lGraph, nextVertex); + termsIt++; + if(termsIt == termsIt_end ) { +#ifdef DEBUG_LP + LDEBUG << "GazeteerTransition::checkMultiTerms: list of simple terms exhausted!"; +#endif + // list of Simple term exhausted: success + // we push the path in the aGraph as a solution of triggerMatch + // Only if size of solution is greater than previous one !! 
+ if( matches.empty() || (triggerMatch.size() > matches.top().size()) ) { +#ifdef DEBUG_LP + LDEBUG << "GazeteerTransition::checkMultiTerms: push (in matches) a deque of size " << triggerMatch.size(); +#endif + matches.push(triggerMatch); + } + // no need to go forward + break; + } + // else we do not stack next steps, we obtain a cut + } + } + } + } + if( matches.empty() ) + return false; + return true; +} + +} // namespace end +} // namespace end +} // namespace end diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/gazeteerTransition.h b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/gazeteerTransition.h new file mode 100644 index 000000000..9612c05bf --- /dev/null +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/gazeteerTransition.h @@ -0,0 +1,112 @@ +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. 
If not, see +*/ +/************************************************************************ + * + * @file gazeteerTransition.h + * @author Olivier Mesnard (olivier.mesnard@cea.fr) + * @date Thu August 04 2015 + * copyright Copyright (C) 2002-2015 by CEA LIST + * Project Automaton + * + * @brief transitions that are surface form : belongs to a gazeteer + * + ***********************************************************************/ + +#ifndef GAZETEERTRANSITION_H +#define GAZETEERTRANSITION_H + +#include "AutomatonExport.h" +#include "automatonCommon.h" +#include "transitionUnit.h" +#include +#include "searchGraph.h" + +namespace Lima { +namespace LinguisticProcessing { +namespace Automaton { + +class LIMA_AUTOMATON_EXPORT GazeteerTransition : public TransitionUnit +{ + public: + GazeteerTransition(); + GazeteerTransition(const std::vector& wordSet, const LimaString& alias, bool keep=true); + GazeteerTransition(const GazeteerTransition&); + virtual ~GazeteerTransition(); + GazeteerTransition& operator = (const GazeteerTransition&); + + GazeteerTransition* clone() const; + GazeteerTransition* create() const; + + std::string printValue() const; + bool operator== (const TransitionUnit&) const; + + bool compare(const LinguisticAnalysisStructure::AnalysisGraph& graph, + const LinguisticGraphVertex& vertex, + AnalysisContent& analysis, + const LinguisticAnalysisStructure::Token* token, + const LinguisticAnalysisStructure::MorphoSyntacticData* data) const; + + TypeTransition type() const; + LimaString alias() const; + const std::set& wordSet() const; + + bool matchPath(const LinguisticAnalysisStructure::AnalysisGraph& graph, + const LinguisticGraphVertex& vertex, + const LinguisticGraphVertex& limit, + SearchGraph* searchGraph, + AnalysisContent& analysis, + const LinguisticAnalysisStructure::Token* token, + std::deque& vertices, + const LinguisticAnalysisStructure::MorphoSyntacticData* ) const; + + private: + bool checkMultiTerms( const 
LinguisticAnalysisStructure::AnalysisGraph& graph, + const LinguisticGraphVertex& position, + const LinguisticGraphVertex& limit, + SearchGraph* searchGraph, + AnalysisContent& analysis, + const std::vector >& additionalMultiTermList, + std::stack,std::vector > >& matches + ) const; + + bool buildNextTermsList( const LimaString& firstSimpleTerm, std::vector >& multiTermList ) const; + std::set m_wordSet; + LimaString m_alias; + +}; + + +/***********************************************************************/ +// inline access functions +/***********************************************************************/ +inline const std::set& GazeteerTransition::wordSet() const { return m_wordSet; } +inline TypeTransition GazeteerTransition::type() const { return T_GAZETEER; } + +inline GazeteerTransition* GazeteerTransition::clone() const { + return new GazeteerTransition(*this); } +inline GazeteerTransition* GazeteerTransition::create() const { + return new GazeteerTransition(); } +inline LimaString GazeteerTransition::alias() const { return m_alias; } + + +} // namespace end +} // namespace end +} // namespace end + +#endif diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/recognizer.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/recognizer.cpp index 5d6352f68..58bb4624c 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/recognizer.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/recognizer.cpp @@ -1,1075 +1,1142 @@ -/* - Copyright 2002-2013 CEA LIST - - This file is part of LIMA. - - LIMA is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. 
- - LIMA is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with LIMA. If not, see -*/ -/************************************************************************ -* -* File : recognizer.cpp -* Author : Romaric Besancon (besanconr@zoe.cea.fr) -* Created on : Tue Oct 15 2002 -* Copyright : (c) 2002 by CEA -* -************************************************************************/ - -#include "recognizer.h" - -#include "transitionSearchStructure.h" -#include "automatonCommon.h" -#include "transitionUnit.h" -#include "recognizerData.h" -#include "common/Data/LimaString.h" -#include "common/MediaticData/EntityType.h" -#include "common/MediaticData/mediaticData.h" -#include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" -#include "common/AbstractFactoryPattern/SimpleFactory.h" -#include -#include -#include -#include -#include -#include -#include - -using namespace std; -using namespace Lima::LinguisticProcessing::LinguisticAnalysisStructure; -using namespace Lima::LinguisticProcessing::ApplyRecognizer; - -namespace Lima { -namespace LinguisticProcessing { -namespace Automaton { - -// a comparison operator on Rule pointer: -// to sort SetOfRules on decreasing rule weights -class CompareRulePtr { -public: - bool operator()(Rule* r1,Rule* r2) { - return (r1->getWeight() > r2->getWeight()); - } -}; - - -// a comparison operator on TriggerRule -class Recognizer::CompareTriggerRule { -public: - bool operator()(const Recognizer::TriggerRule* r1, - const Recognizer::TriggerRule* r2) { - return (r1->setOfRules().front()->getWeight() > - r2->setOfRules().front()->getWeight()); - } -}; - - -/** recognizer factory */ -SimpleFactory recognizerFactory(RECOGNIZER_CLASSID); - 
-//********************************************************************** -// constructors -//********************************************************************** -Recognizer::Recognizer(): - AbstractResource(), - m_rules(0), - m_ruleStorage(0), - m_language(), - m_automatonControlParams(), - m_filename(), - m_searchStructure() -{ } - -// copy is complex because of the pointers -Recognizer::Recognizer(const Recognizer& r): -AbstractResource(r) -{ - init(); - copy(r); - - // have to initialize the search structure of the new recognizer - initializeSearchStructure(); -} - -//********************************************************************** -// destructor -//********************************************************************** -Recognizer::~Recognizer() -{ - freeMem(); - clearSearchStructure(); -} - -//********************************************************************** -// copy -//********************************************************************** -Recognizer& Recognizer::operator = (const Recognizer& r) -{ - if (this != &r) - { - freeMem(); - init(); - copy(r); - } - - // do not copy the search structure : recompute it the new recognizer - // (not sure the copy is less complex than recomputing it) - initializeSearchStructure(); - - return (*this); -} - -void Recognizer::init( - Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, - Manager* manager) - -{ - - /** @addtogroup ResourceConfiguration - * - <group name="..." 
class="AutomatonRecognizer"> - * -  rules : file containing the compiled rules of the recognizer - * -  maxDepthStack : maximum size of stack in depth-first-search - * when testing a rule (default is 100) - * -  maxTransitionsExplored : max number of transitions explored - * when testing a rule (default is 1000) - * -  maxNbResults : max number of results temporarily stored - * when testing a rule (default is 50) - * -  maxResultSize : max size of a result for a rule match - * (this parameter can be seen as the effective size of - * "n" when using {0-n} in a rule) (default is 200) - */ - - m_language=manager->getInitializationParameters().language; - string resourcesPath=Common::MediaticData::MediaticData::single().getResourcesPath(); - try - { - string rulesFile = unitConfiguration.getParamsValueAtKey("rules"); - if (rulesFile != "") - { - m_filename=rulesFile; - rulesFile = resourcesPath + "/" + rulesFile; -// LDEBUG << "read recognizer from file : " << rulesFile; - //readFromFile(rulesFile); - AutomatonReader reader; - reader.readRecognizer(rulesFile,*this); - } - } - catch (Common::XMLConfigurationFiles::NoSuchParam& ) { - AULOGINIT; - LERROR << "No param 'rules' in recognizer group for language " << (int)m_language; - throw InvalidConfiguration(); - } - - try - { - string str=unitConfiguration.getParamsValueAtKey("maxDepthStack"); - uint64_t val=atol(str.c_str()); - if (val==0) { - AULOGINIT; - LWARN << "maxDepthStack is 0: keep default value"; - } - else { - m_automatonControlParams.setMaxDepthStack(val); - } - } - catch (Common::XMLConfigurationFiles::NoSuchParam& ) { - // keep default value - } - - try - { - string str=unitConfiguration.getParamsValueAtKey("maxTransitionsExplored"); - uint64_t val=atol(str.c_str()); - if (val==0) { - AULOGINIT; - LWARN << "maxTransitionsExplored is 0: keep default value"; - } - else { - m_automatonControlParams.setMaxTransitionsExplored(val); - } - } - catch (Common::XMLConfigurationFiles::NoSuchParam& ) { - // keep default 
value - } - - try - { - string str=unitConfiguration.getParamsValueAtKey("maxNbResults"); - uint64_t val=atol(str.c_str()); - if (val==0) { - AULOGINIT; - LWARN << "maxNbResults is 0: keep default value"; - } - else { - m_automatonControlParams.setMaxNbResults(val); - } - } - catch (Common::XMLConfigurationFiles::NoSuchParam& ) { - // keep default value - } - - try - { - string str=unitConfiguration.getParamsValueAtKey("maxResultSize"); - uint64_t val=atol(str.c_str()); - if (val==0) { - AULOGINIT; - LWARN << "maxResultSize is 0: keep default value"; - } - else { - m_automatonControlParams.setMaxResultSize(val); - } - } - catch (Common::XMLConfigurationFiles::NoSuchParam& ) { - // keep default value - } - - Common::MediaticData::MediaticData::changeable().stringsPool(m_language).endResourcesPool(); -} - -//********************************************************************** -// helper functions for constructors and destructors -//********************************************************************** -void Recognizer::init() -{ - m_rules.clear(); - m_ruleStorage.clear(); - m_language=UNDEFLANG; - m_automatonControlParams=AutomatonControlParams(); -} - -void Recognizer::copy(const Recognizer& r) -{ - map pointersMap; - - for (uint64_t i(0); iclone(); - m_rules.push_back(TriggerRule(t,SetOfRules(0))); - for (uint64_t j(0); j results; - if (testSetOfRules(*(m_rules[offset].first), - m_rules[offset].second, - graph, - current, - graph.firstVertex(), - graph.lastVertex(), - analysis, - results)) - { - result=results.front(); // only one result because stopAtFirstSuccess=true - return true; - } - return false; -} -*/ - -//********************************************************************** -// test a set of rules for a trigger -uint64_t Recognizer::testSetOfRules(const TransitionUnit& trigger, - const SetOfRules& rules, - const LinguisticAnalysisStructure::AnalysisGraph& graph, - const LinguisticGraphVertex& position, - const LinguisticGraphVertex& begin, - const 
LinguisticGraphVertex& end, - AnalysisContent& analysis, - vector& matches, - std::set* forbiddenTypes, - bool stopAtFirstSuccess, - bool onlyOneSuccessPerType, - bool applySameRuleWhileSuccess) const { - RecognizerMatch leftmatch(&graph); - RecognizerMatch rightmatch(&graph); - - if (onlyOneSuccessPerType && forbiddenTypes==0) { - AULOGINIT; - LERROR << "cannot use onlyOneSuccessPerType " - << "when forbidden types are not allowed"; - onlyOneSuccessPerType=false; - } - - - uint64_t nbSuccess(0); - - // left context is same LinguisticAnalysisStructure::AnalysisGraph as current (current is in fact - // between the current token and the previous one) - LinguisticGraphVertex left=position; - LinguisticGraphVertex right=position; - //LinguisticGraphVertex right=position.forward(); - -#ifdef DEBUG_LP - AULOGINIT; - LDEBUG << "testing set of rules triggered by " << trigger << " on vertex " << position; - LDEBUG << "onlyOneSuccessPerType=" << onlyOneSuccessPerType; - if (logger.isDebugEnabled()) { - std::ostringstream oss; - for (SetOfRules::const_iterator it=rules.begin(),it_end=rules.end();it!=it_end;it++) { - oss << " - " << (*it)->getWeight(); - } - LDEBUG << "Rule weights" << oss.str(); - } -#endif - - bool reapplySameRule(false); - - SetOfRules::const_iterator - rule=rules.begin(), - rule_end=rules.end(); - for (; rule!=rule_end; rule++) { - Rule* currentRule=*rule; - -#ifdef DEBUG_LP - if (logger.isDebugEnabled()) { - LDEBUG << "testing rule "<<*currentRule << "," << currentRule->getRuleId() <<" of type " - << currentRule->getType() << ",reapply=" - << reapplySameRule << " from " << position; - } -#endif - - if (forbiddenTypes && - forbiddenTypes->find(currentRule->getType()) - != forbiddenTypes->end()) { - // type previously forbidden by a negative rule -/* LDEBUG << "type " << currentRule->getType() - << " is forbidden: continue";*/ - continue; - } - - // initializes the constraint checklist - ConstraintCheckList - 
constraintCheckList(currentRule->numberOfConstraints(), - ConstraintCheckListElement(graph)); - - // treat the constraints for the trigger with the constraint - // checklist corresponding to this rule - //Token* token=get(vertex_token,*(graph.getGraph()),position); -// LDEBUG << "Recognizer: checking trigger constraints: "; - - if (!trigger.checkConstraints(graph,position,analysis, - constraintCheckList)) { - // one unary constraint was not verified -// LDEBUG << "one unary constraint on trigger not verified"; - - // apply actions (for actions triggered by failure) - if (!currentRule->negative()) { - currentRule->executeActions(graph, analysis, - constraintCheckList, - false, - 0); // match is not used -// LDEBUG << "actionSuccess=" << actionSuccess; - } - continue; - } - - leftmatch.reinit(); - rightmatch.reinit(); - ForwardSearch forward; - BackwardSearch backward; - bool success = currentRule->test(graph, left, right, - begin, end, analysis, - leftmatch, rightmatch, - constraintCheckList,forward,backward, - m_automatonControlParams); - //LDEBUG << "success=" << success; - - RecognizerMatch* match=0; - - if (success) { - // build complete match - - match=new RecognizerMatch(leftmatch); - match->addBackVertex(position,trigger.keep(), "trigger"); - match->addBack(rightmatch); - // remove elements not kept at begin and end of the expression - match->removeUnkeptAtExtremity(); - - // check if trigger is head - if (trigger.head()) { - match->setHead(position); - } - match->setType(currentRule->getType()); - match->setLinguisticProperties(currentRule->getLinguisticProperties()); - match->setContextual(currentRule->contextual()); - setNormalizedForm(currentRule->getNormalizedForm(),*match); - } - - // execute possible actions associated to the rule iff current rule is - // positive - //LDEBUG << "Recognizer: executing actions: "; - bool actionSuccess = true; - if (!currentRule->negative()) { - // std::cerr << "execute rule " << currentRule->getRuleId() << " of type " - 
// << currentRule->getType() << " on vertex " << position << std::endl; - actionSuccess = currentRule->executeActions(graph, analysis, - constraintCheckList, - success, - match); - //LDEBUG << "actionSuccess=" << actionSuccess; - } - -#ifdef DEBUG_LP - if (logger.isDebugEnabled()) { - LinguisticGraphVertex v=position; - LimaString str(""); - Token* token=get(vertex_token,*(graph.getGraph()),position); - if (token!=0) { - str = token->stringForm(); - } - if (success) { - LDEBUG << "trigger " << v << "[" << str << "]:rule " - << currentRule->getRuleId() << "-> success=" << success - << ",actionSuccess=" << actionSuccess; - LDEBUG << " matched:" << match->getNormalizedString(Common::MediaticData::MediaticData::single().stringsPool(m_language)); - } - else { - LDEBUG << "vertex " << v << "[" << str << "]:rule " - << currentRule->getRuleId() << "-> success= false"; - } - } -#endif - - if (success && actionSuccess) { - if (forbiddenTypes && currentRule->negative()) { - forbiddenTypes->insert(currentRule->getType()); - success = false; - delete match; - match=0; - continue; - } - - RecognizerData* recoData = static_cast(analysis.getData("RecognizerData")); - if (stopAtFirstSuccess||(recoData != 0 && !recoData->getNextVertices().empty())) { - matches.push_back(*match); - delete match; // a copy has been made - match=0; -#ifdef DEBUG_LP - if (logger.isDebugEnabled()) { - LDEBUG << "Returning from testSetOfRules cause stopAtFirstSuccess (" - << stopAtFirstSuccess << ") or next vertices empty (" - << (recoData->getNextVertices().empty()) - << ")"; - } -#endif - return 1; - } - else { - if (applySameRuleWhileSuccess) { - if (reapplySameRule) { - if (*match==matches.back()) { -// AULOGINIT; -// LDEBUG << "Reapplication of same rule gives same result: " -// << "abort to avoid inifinite loop: " -// << *match << ";" << matches.back(); - delete match; // a copy has been made - match=0; - reapplySameRule=false; - continue; - } -/* else { - LDEBUG << "Reapplication of same rule gives 
new result"; - }*/ - } - // reapply same rule - rule--; - reapplySameRule=true; - } - -// LDEBUG << "add match to results " << *match; - matches.push_back(*match); - delete match; // a copy has been made - match=0; - - if (onlyOneSuccessPerType) { -/* LDEBUG << "add " << currentRule->getType() - << " in forbiddenTypes";*/ - forbiddenTypes->insert(currentRule->getType()); - } - nbSuccess++; - } - } - else { -// LDEBUG << "-> no success"; - reapplySameRule=false; - } - - if (match !=0) { - delete match; - } - } - - return nbSuccess; -} - -//********************************************************************** -// normalization function -//********************************************************************** -void Recognizer:: -setNormalizedForm(const LimaString& norm, - RecognizerMatch& match) const -{ - match.features().clear(); - - const FsaStringsPool& sp=Common::MediaticData::MediaticData::single().stringsPool(m_language); - if (norm.isEmpty()) { - // use surface form of the expression as normalized form - match.features().setFeature(DEFAULT_ATTRIBUTE,match.getNormalizedString(sp)); - } - else { - match.features().setFeature(DEFAULT_ATTRIBUTE,norm); - } -} - -//********************************************************************** -// main functions that applies the recognizer on a graph -//********************************************************************** - -// Apply between two nodes and search between the same ones -uint64_t Recognizer:: - apply(const LinguisticAnalysisStructure::AnalysisGraph& graph, - const LinguisticGraphVertex& begin, - const LinguisticGraphVertex& end, - AnalysisContent& analysis, - std::vector& result, - bool testAllVertices, - bool stopAtFirstSuccess, - bool onlyOneSuccessPerType, - bool returnAtFirstSuccess, - bool applySameRuleWhileSuccess) const -{ - return apply(graph, - begin, - end, - begin, - end, - analysis, - result, - testAllVertices, - stopAtFirstSuccess, - onlyOneSuccessPerType, - returnAtFirstSuccess, - 
applySameRuleWhileSuccess); -} - -// Apply between two nodes and search between two others. -// precondition [begin, end] included in [upstreamBound,downstreamBound] -uint64_t Recognizer:: - apply(const LinguisticAnalysisStructure::AnalysisGraph& graph, - const LinguisticGraphVertex& begin, - const LinguisticGraphVertex& end, - const LinguisticGraphVertex& upstreamBound, - const LinguisticGraphVertex& downstreamBound, - AnalysisContent& analysis, - std::vector& result, - bool testAllVertices, - bool stopAtFirstSuccess, - bool onlyOneSuccessPerType, - bool returnAtFirstSuccess, - bool applySameRuleWhileSuccess) const -{ - - if (returnAtFirstSuccess) { - stopAtFirstSuccess=true; // implied by the other - } - -#ifdef DEBUG_LP - AULOGINIT; - LDEBUG << "apply recognizer " << m_filename << " from vertex " - << begin << " to vertex " << end; - LDEBUG << " up bound: " << upstreamBound << "; down bound: " << downstreamBound << "; testAllVertices: " << testAllVertices; - LDEBUG << " stopAtFirstSuccess: " << stopAtFirstSuccess << "; onlyOneSuccessPerType: " << onlyOneSuccessPerType; - LDEBUG << " returnAtFirstSuccess: " << returnAtFirstSuccess << "; applySameRuleWhileSuccess: " << applySameRuleWhileSuccess; -#endif - - uint64_t numberOfRecognized(0); - bool success(false); - - // use deque instead of queue to be able to clear() - std::deque toVisit; - std::set visited; - - toVisit.push_back(begin); - // patch for inifinite loop : avoid begin stopped at first step - //visited.insert(begin); - - bool lastReached = false; - while (!toVisit.empty()) - { - LinguisticGraphVertex currentVertex=toVisit.front(); - toVisit.pop_front(); - // patch for inifinite loop : check if we already seen this node - if (visited.find(currentVertex) != visited.end()) - { - continue; - } - - visited.insert(currentVertex); -#ifdef DEBUG_LP - LDEBUG << "to visit size=" << toVisit.size() << " ; currentVertex=" << currentVertex; -#endif - - if (lastReached || // limit given by argument - currentVertex == 
graph.lastVertex()) { // end of the graph - // LDEBUG << "vertex " << currentVertex << " is last vertex"; - continue; // may be other nodes to test in queue - } - if (currentVertex == end ) { // limit given by argument - lastReached = true; - } - - if (currentVertex != graph.firstVertex()) { -#ifdef DEBUG_LP - LDEBUG << "Recognizer: test on vertex " << currentVertex; -#endif - success = testOnVertex(graph,currentVertex, - upstreamBound,downstreamBound, - analysis,result, - stopAtFirstSuccess, - onlyOneSuccessPerType, - applySameRuleWhileSuccess); - if (success) { - numberOfRecognized++; - if (returnAtFirstSuccess) - return numberOfRecognized; - if (! testAllVertices) { // restart from end of recognized expression -#ifdef DEBUG_LP - LDEBUG << "success: continue from vertex " << currentVertex; -#endif - // GC on 20110803: the clearing below was problematic in case of rules like that: - // []:(t_capital_1st|t_capital){1-3} [,]::LOCATION:N_LOCATION - // which matches text before (left) the trigger which is not included in the match. - // thus the next vertex explored was the newly created one ; the vertex following - // it is already visited (this is in this case the comma) and the content of - // toVisit (the vertex after the trigger) was removed. Thus the search stopped after - // the new vertex. - // Warning: what is the inpact on the use of the testAllVertices parameter ? And is there - // any other side effect ? 
-// toVisit.clear(); - - } - } - } - - // store following nodes to test - LinguisticGraphOutEdgeIt outEdge,outEdge_end; - boost::tie (outEdge,outEdge_end)=out_edges(currentVertex,*(graph.getGraph())); - - for (; outEdge!=outEdge_end; outEdge++) { - LinguisticGraphVertex next=target(*outEdge,*(graph.getGraph())); - if (visited.find(next)==visited.end()) { -#ifdef DEBUG_LP - LDEBUG << "Recognizer: adding out edge target vertex to the 'to visit' list: " << next; -#endif - toVisit.push_back(next); - // do not put in visited unless it is really visited - // (otherwise, may be suppressed when testAllVertices is false - // and never visited) - //visited.insert(next); - } - else { -#ifdef DEBUG_LP - LDEBUG << "Recognizer: already visited:" << next; -#endif - } - } - RecognizerData* recoData=static_cast(analysis.getData("RecognizerData")); - std::set& nextVertices = recoData->getNextVertices(); - if (recoData != 0 && !nextVertices.empty() ) - { -#ifdef DEBUG_LP - LDEBUG << "Recognizer: adding next vertices to the 'to visit' list"; -#endif - std::set< LinguisticGraphVertex >::const_iterator nvit, nvit_end; - nvit = nextVertices.begin(); - nvit_end = nextVertices.end(); - for (; nvit != nvit_end; nvit++) - { -#ifdef DEBUG_LP - LDEBUG << " - " << *nvit; -#endif - toVisit.push_front(*nvit); - } - nextVertices.clear(); - } -#ifdef DEBUG_LP - LDEBUG << "Recognizer: 'to visit' list size is now: " << toVisit.size(); -#endif - } - return numberOfRecognized; -} - - -//********************************************************************** -// test the recognizer on a vertex : test -//********************************************************************** -uint64_t Recognizer:: -testOnVertex(const LinguisticAnalysisStructure::AnalysisGraph& graph, - LinguisticGraphVertex& current, - const LinguisticGraphVertex& begin, - const LinguisticGraphVertex& end, - AnalysisContent& analysis, - std::vector& result, - bool stopAtFirstSuccess, - bool onlyOneSuccessPerType, - bool 
applySameRuleWhileSuccess) const -{ - //AULOGINIT; - Token* token = get(vertex_token, *(graph.getGraph()), current); - MorphoSyntacticData* data = get(vertex_data, *(graph.getGraph()), current); - - if (token==0) { - AULOGINIT; - LERROR << "no token for vertex " << current; - return 0; - } - - if (data==0) { - AULOGINIT; - LERROR << "no data for vertex " << current; - return 0; - } - - vector matchingRules; - set forbiddenTypes; - uint64_t nbSuccess=0; - - findNextSetOfRules(graph, current, analysis, token, data, matchingRules); - - if (! matchingRules.empty()) { - std::vector::const_iterator - ruleSet=matchingRules.begin(), - ruleSet_end=matchingRules.end(); - for (; ruleSet!=ruleSet_end; ruleSet++) { - uint64_t nbSuccessForTheseRules= - testSetOfRules(*((*ruleSet)->transitionUnit()), - (*ruleSet)->setOfRules(), - graph, current, begin, end,analysis, - result, &forbiddenTypes, - stopAtFirstSuccess, - onlyOneSuccessPerType, - applySameRuleWhileSuccess); - if (nbSuccessForTheseRules>0) { - nbSuccess+=nbSuccessForTheseRules; - // skip recognized part (if the end of the recognized part is after - // current token) - RecognizerMatch& lastSuccess=result.back(); - Token* t=get(vertex_token,*(graph.getGraph()),current); - uint64_t currentTokenEnd=t->position()+t->length(); - RecognizerData* recoData = static_cast(analysis.getData("RecognizerData")); - if (stopAtFirstSuccess||(recoData != 0 && !recoData->getNextVertices().empty())) { - if (lastSuccess.positionEnd() >= currentTokenEnd) { - current=lastSuccess.getEnd(); - } - break; - } - } - } - for(std::vector::iterator it=matchingRules.begin(), - it_end=matchingRules.end(); it!=it_end; it++) { - if (*it!=0) { - delete (*it); - } - } - } - forbiddenTypes.clear(); - - // LDEBUG << "testOnVertex nb successes: " << nbSuccess; - return nbSuccess; -} - -//********************************************************************** -//resolve the problem of overlapping entities in the list of entities : -// when two entities are 
overlaping, only one is kept -//********************************************************************** -uint64_t Recognizer:: -resolveOverlappingEntities(std::vector& listEntities, - const OverlapResolutionStrategy& strategy) const -{ - typedef std::vector::iterator vectorRecognizerMatchIterator; - - uint64_t numberOfOverlappingEntities(0); - - if (listEntities.empty()) { - return numberOfOverlappingEntities; - } - - switch (strategy) { - case IGNORE_FIRST: { - vectorRecognizerMatchIterator currentEntity(listEntities.begin()); - vectorRecognizerMatchIterator nextEntity(currentEntity); - nextEntity++; - while (nextEntity != listEntities.end()) { - if (currentEntity->isOverlapping(*nextEntity)) { - numberOfOverlappingEntities++; - currentEntity=listEntities.erase(currentEntity); - nextEntity=currentEntity; - nextEntity++; - } - else { - currentEntity++; - nextEntity++; - } - } - break; - } - case IGNORE_SECOND: { - vectorRecognizerMatchIterator currentEntity(listEntities.begin()); - vectorRecognizerMatchIterator previousEntity(currentEntity); - currentEntity++; - while (currentEntity != listEntities.end()) { - if (currentEntity->isOverlapping(*previousEntity)) { - numberOfOverlappingEntities++; - currentEntity=listEntities.erase(currentEntity); - } - else { - previousEntity++; - currentEntity++; - } - } - break; - } - case IGNORE_SMALLEST: { - vectorRecognizerMatchIterator currentEntity(listEntities.begin()); - vectorRecognizerMatchIterator previousEntity(currentEntity); - currentEntity++; - while (currentEntity != listEntities.end()) { - if (currentEntity->isOverlapping(*previousEntity)) { - numberOfOverlappingEntities++; - if (currentEntity->numberOfElements() - < previousEntity->numberOfElements()) { // keep previous entity - currentEntity=listEntities.erase(currentEntity); - } - else { // keep current entity - previousEntity=listEntities.erase(previousEntity); - currentEntity=previousEntity; - currentEntity++; - } - } - else { - previousEntity++; - 
currentEntity++; - } - } - break; - } - default: - break; - } - - return numberOfOverlappingEntities; -} - -//********************************************************************** -// find the set of rules in the recognizer that accept -// a particular token as trigger -//********************************************************************** -void Recognizer:: -findNextSetOfRules(const LinguisticAnalysisStructure::AnalysisGraph& graph, - LinguisticGraphVertex& vertex, - AnalysisContent& analysis, - const LinguisticAnalysisStructure::Token* token, - const LinguisticAnalysisStructure::MorphoSyntacticData* data, - std::vector& matchingSetOfRules) const -{ - matchingSetOfRules.clear(); - - // find matching rules - std::vector matchingRules; - m_searchStructure.findMatchingTransitions(graph,vertex,analysis,token,data,matchingRules); - - // matching rules are gathered by common trigger (transition unit) - // we have to re-sort the rules by their weight at a global level, independently of the trigger - // create a vector of TriggerRule where each contains only one rule, then sort it - for (std::vector::const_iterator it=matchingRules.begin(),it_end=matchingRules.end();it!=it_end;it++) { - for (SetOfRules::const_iterator r=(*it)->setOfRules().begin(),r_end=(*it)->setOfRules().end(); r!=r_end;r++) { - matchingSetOfRules.push_back(new TriggerRule((*it)->transitionUnit(),SetOfRules(1,*r))); - } - } - sort(matchingSetOfRules.begin(),matchingSetOfRules.end(),CompareTriggerRule()); - - // then, gather rules with the same trigger that are consecutive in this new list - // (may save some constraint checking on trigger) - if (! 
matchingSetOfRules.empty()) { - std::vector::iterator it=matchingSetOfRules.begin(); - TransitionUnit* currentTrigger=(*it)->transitionUnit(); - std::vector::iterator next=it; - next++; - while (next!=matchingSetOfRules.end()) { - if ((*next)->transitionUnit() == currentTrigger) { - (*it)->second.push_back((*next)->setOfRules().front()); - delete *next; - next=matchingSetOfRules.erase(next); - } - else { - it++; - currentTrigger=(*it)->transitionUnit(); - next++; - } - } - } -} - -void Recognizer::initializeSearchStructure() { - const Common::PropertyCode::PropertyAccessor* macro=&(static_cast(Common::MediaticData::MediaticData::single().mediaData(m_language)).getPropertyCodeManager().getPropertyAccessor("MACRO")); - const Common::PropertyCode::PropertyAccessor* micro=&(static_cast(Common::MediaticData::MediaticData::single().mediaData(m_language)).getPropertyCodeManager().getPropertyAccessor("MICRO")); - m_searchStructure.init(m_rules,macro,micro); -} - -void Recognizer::clearSearchStructure() { - m_searchStructure.clear(); -} - -//********************************************************************** -// adding a rule -//********************************************************************** -uint64_t Recognizer::addRuleInStorage(Rule* rule) -{ - // add the rule in the storage - m_ruleStorage.push_back(rule); - // return the index of the rule in the storage - return (m_ruleStorage.size() - 1); -} - -uint64_t Recognizer::addRule(TransitionUnit* trigger, Rule* rule) -{ - uint64_t indexRule=addRuleInStorage(rule); - - // find if the trigger already exists in the set of triggers - for (uint64_t i(0); iclone(), - SetOfRules(1,rule))); - - return indexRule; -} - -void Recognizer::addRule(TransitionUnit* trigger, - const uint64_t index) -{ - // find if the trigger already exists in the set of triggers - for (uint64_t i(0); iclone(), - SetOfRules(1,m_ruleStorage[index]))); -} - -//********************************************************************** -// input/output in a 
binary format -//********************************************************************** -// void Recognizer::readFromTextFile(std::string filename) { -// RecognizerCompiler::buildRecognizer(*this,filename); -// } - -// simple linear search (called only with write function -> not optimized) -uint64_t Recognizer::findRuleIndex(Rule* r) const -{ - for (uint64_t i(0); i" << m_rules[i].first->printValue() << "" - << "" << i << "" << endl; - } -} - -//*************************************************************************** -// output -//*************************************************************************** -ostream& operator << (ostream& os, const Recognizer& r) -{ - for (uint64_t i(0); i +*/ +/************************************************************************ +* +* File : recognizer.cpp +* Author : Romaric Besancon (besanconr@zoe.cea.fr) +* Created on : Tue Oct 15 2002 +* Copyright : (c) 2002 by CEA +* +************************************************************************/ + +#include "recognizer.h" + +#include "transitionSearchStructure.h" +#include "automatonCommon.h" +#include "transitionUnit.h" +#include "recognizerData.h" +#include "common/tools/FileUtils.h" +#include "common/Data/LimaString.h" +#include "common/MediaticData/EntityType.h" +#include "common/MediaticData/mediaticData.h" +#include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" +#include "common/AbstractFactoryPattern/SimpleFactory.h" +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; +using namespace Lima::LinguisticProcessing::LinguisticAnalysisStructure; +using namespace Lima::LinguisticProcessing::ApplyRecognizer; + +namespace Lima { +namespace LinguisticProcessing { +namespace Automaton { + +// a comparison operator on Rule pointer: +// to sort SetOfRules on decreasing rule weights +class CompareRulePtr { +public: + bool operator()(Rule* r1,Rule* r2) { + return (r1->getWeight() > r2->getWeight()); + } +}; + + 
+// a comparison operator on TriggerRule +class Recognizer::CompareTriggerRule { +public: + bool operator()(const Recognizer::TriggerRule* r1, + const Recognizer::TriggerRule* r2) { + return (r1->setOfRules().front()->getWeight() > + r2->setOfRules().front()->getWeight()); + } +}; + + +/** recognizer factory */ +SimpleFactory recognizerFactory(RECOGNIZER_CLASSID); + +//********************************************************************** +// constructors +//********************************************************************** +Recognizer::Recognizer(): + AbstractResource(), + m_rules(0), + m_ruleStorage(0), + m_language(), + m_automatonControlParams(), + m_filename(), + m_searchStructure() +{ } + +// copy is complex because of the pointers +Recognizer::Recognizer(const Recognizer& r): +AbstractResource(r) +{ + init(); + copy(r); + + // have to initialize the search structure of the new recognizer + initializeSearchStructure(); +} + +//********************************************************************** +// destructor +//********************************************************************** +Recognizer::~Recognizer() +{ + freeMem(); + clearSearchStructure(); +} + +//********************************************************************** +// copy +//********************************************************************** +Recognizer& Recognizer::operator = (const Recognizer& r) +{ + if (this != &r) + { + freeMem(); + init(); + copy(r); + } + + // do not copy the search structure : recompute it the new recognizer + // (not sure the copy is less complex than recomputing it) + initializeSearchStructure(); + + return (*this); +} + +void Recognizer::init( + Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, + Manager* manager) + +{ + + /** @addtogroup ResourceConfiguration + * - <group name="..." 
class="AutomatonRecognizer"> + * -  rules : file containing the compiled rules of the recognizer + * -  maxDepthStack : maximum size of stack in depth-first-search + * when testing a rule (default is 100) + * -  maxTransitionsExplored : max number of transitions explored + * when testing a rule (default is 1000) + * -  maxNbResults : max number of results temporarily stored + * when testing a rule (default is 50) + * -  maxResultSize : max size of a result for a rule match + * (this parameter can be seen as the effective size of + * "n" when using {0-n} in a rule) (default is 200) + */ + + m_language=manager->getInitializationParameters().language; + string resourcesPath=Common::MediaticData::MediaticData::single().getResourcesPath(); + try + { + QString rulesFile = unitConfiguration.getParamsValueAtKey("rules").c_str(); + if (!rulesFile.isEmpty()) + { + m_filename=rulesFile.toUtf8().constData(); + rulesFile = Common::Misc::findFileInPaths(resourcesPath.c_str(), rulesFile); +// LDEBUG << "read recognizer from file : " << rulesFile; + //readFromFile(rulesFile); + AutomatonReader reader; + reader.readRecognizer(rulesFile.toUtf8().constData(),*this); + } + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + AULOGINIT; + LERROR << "No param 'rules' in recognizer group for language " << (int)m_language; + throw InvalidConfiguration(); + } + + try + { + string str=unitConfiguration.getParamsValueAtKey("maxDepthStack"); + uint64_t val=atol(str.c_str()); + if (val==0) { + AULOGINIT; + LWARN << "maxDepthStack is 0: keep default value"; + } + else { + m_automatonControlParams.setMaxDepthStack(val); + } + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + // keep default value + } + + try + { + string str=unitConfiguration.getParamsValueAtKey("maxTransitionsExplored"); + uint64_t val=atol(str.c_str()); + if (val==0) { + AULOGINIT; + LWARN << "maxTransitionsExplored is 0: keep default value"; + } + else { + 
m_automatonControlParams.setMaxTransitionsExplored(val); + } + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + // keep default value + } + + try + { + string str=unitConfiguration.getParamsValueAtKey("maxNbResults"); + uint64_t val=atol(str.c_str()); + if (val==0) { + AULOGINIT; + LWARN << "maxNbResults is 0: keep default value"; + } + else { + m_automatonControlParams.setMaxNbResults(val); + } + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + // keep default value + } + + try + { + string str=unitConfiguration.getParamsValueAtKey("maxResultSize"); + uint64_t val=atol(str.c_str()); + if (val==0) { + AULOGINIT; + LWARN << "maxResultSize is 0: keep default value"; + } + else { + m_automatonControlParams.setMaxResultSize(val); + } + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + // keep default value + } + + Common::MediaticData::MediaticData::changeable().stringsPool(m_language).endResourcesPool(); +} + +//********************************************************************** +// helper functions for constructors and destructors +//********************************************************************** +void Recognizer::init() +{ + m_rules.clear(); + m_ruleStorage.clear(); + m_language=UNDEFLANG; + m_automatonControlParams=AutomatonControlParams(); +} + +void Recognizer::copy(const Recognizer& r) +{ + map pointersMap; + + for (uint64_t i(0); iclone(); + m_rules.push_back(TriggerRule(t,SetOfRules(0))); + for (uint64_t j(0); j results; + if (testSetOfRules(*(m_rules[offset].first), + m_rules[offset].second, + graph, + current, + graph.firstVertex(), + graph.lastVertex(), + analysis, + results)) + { + result=results.front(); // only one result because stopAtFirstSuccess=true + return true; + } + return false; +} +*/ + +//********************************************************************** +// test a set of rules for a trigger +uint64_t Recognizer::testSetOfRules(const TransitionUnit& trigger, + const SetOfRules& rules, + const 
LinguisticAnalysisStructure::AnalysisGraph& graph, + const LinguisticGraphVertex& position, + const LinguisticGraphVertex& begin, + const LinguisticGraphVertex& end, + AnalysisContent& analysis, + vector& matches, + std::set* forbiddenTypes, + bool stopAtFirstSuccess, + bool onlyOneSuccessPerType, + bool applySameRuleWhileSuccess) const { + AULOGINIT; + // If the trigger is defined with a gazeteer, we must check the case of multi-term elements in the gazeteer + const GazeteerTransition* gazeteerTrigger = dynamic_cast(&trigger); + RecognizerMatch triggermatch(&graph); + LinguisticGraphVertex right=position; + if( gazeteerTrigger != 0 ) { + Token* token = get(vertex_token, *(graph.getGraph()), position); + MorphoSyntacticData* data = get(vertex_data, *(graph.getGraph()), position); + deque vertices; + ForwardSearch searchGraph; + bool match = gazeteerTrigger->matchPath(graph, position, end, &searchGraph, analysis, token, vertices, data); + if( match ) { + for( std::deque::const_iterator vIt = vertices.begin(); vIt != vertices.end() ; vIt++ ) { + triggermatch.addBackVertex(*vIt,trigger.keep(),"trigger"); + } + } + } + else { + triggermatch.addBackVertex(position,trigger.keep(),"trigger"); + right=position; + } + + RecognizerMatch leftmatch(&graph); + RecognizerMatch rightmatch(&graph); + + if (onlyOneSuccessPerType && forbiddenTypes==0) { + LERROR << "Recognizer::testSetOfRules: cannot use onlyOneSuccessPerType " + << "when forbidden types are not allowed"; + onlyOneSuccessPerType=false; + } + + + uint64_t nbSuccess(0); + + // left context is same LinguisticAnalysisStructure::AnalysisGraph as current (current is in fact + // between the current token and the previous one) + LinguisticGraphVertex left=position; + +#ifdef DEBUG_LP + LDEBUG << "Recognizer::testSetOfRules: testing set of rules triggered by " << trigger << " on vertex " << position; + LDEBUG << "onlyOneSuccessPerType=" << onlyOneSuccessPerType; + if (logger.isDebugEnabled()) { + std::ostringstream oss; + 
for (SetOfRules::const_iterator it=rules.begin(),it_end=rules.end();it!=it_end;it++) { + oss << " - " << (*it)->getWeight(); + } + LDEBUG << "Rule weights" << oss.str(); + } +#endif + + bool reapplySameRule(false); + + SetOfRules::const_iterator + #ifdef ANTINNO_BUGFIX + // FWI 19/12/2013 : ajout dfinition de "rule_begin" + rule_begin=rules.begin(), +#endif + rule=rules.begin(), + rule_end=rules.end(); + for (; rule!=rule_end; rule++) { + Rule* currentRule=*rule; + +#ifdef DEBUG_LP + if (logger.isDebugEnabled()) { + LDEBUG << "Recognizer::testSetOfRules: testing rule "<<*currentRule << "," << currentRule->getRuleId() <<" of type " + << currentRule->getType() << ",reapply=" + << reapplySameRule << " from " << position; + } +#endif + + if (forbiddenTypes && + forbiddenTypes->find(currentRule->getType()) + != forbiddenTypes->end()) { + // type previously forbidden by a negative rule +/* LDEBUG << "type " << currentRule->getType() + << " is forbidden: continue";*/ + continue; + } + + // initializes the constraint checklist + ConstraintCheckList + constraintCheckList(currentRule->numberOfConstraints(), + ConstraintCheckListElement(graph)); + + // treat the constraints for the trigger with the constraint + // checklist corresponding to this rule + //Token* token=get(vertex_token,*(graph.getGraph()),position); +// LDEBUG << "Recognizer: checking trigger constraints: "; + + if (!trigger.checkConstraints(graph,position,analysis, + constraintCheckList)) { + // one unary constraint was not verified +// LDEBUG << "one unary constraint on trigger not verified"; + + // apply actions (for actions triggered by failure) + if (!currentRule->negative()) { + currentRule->executeActions(graph, analysis, + constraintCheckList, + false, + 0); // match is not used +// LDEBUG << "actionSuccess=" << actionSuccess; + } + continue; + } + + leftmatch.reinit(); + rightmatch.reinit(); + ForwardSearch forward; + BackwardSearch backward; + bool success = currentRule->test(graph, left, right, + 
begin, end, analysis, + leftmatch, rightmatch, + constraintCheckList,forward,backward, + m_automatonControlParams); + //LDEBUG << "success=" << success; + + RecognizerMatch* match=0; + + if (success) { + // build complete match + + match=new RecognizerMatch(leftmatch); + if (leftmatch.getHead() != 0) { + match->setHead(leftmatch.getHead()); + } + + // TODO: add node of gazeteerTrigger + //match->addBackVertex(position,trigger.keep(), "trigger"); + /* + RecognizerMatch::const_iterator triggerMatchIt = triggermatch.begin(); + for( ; triggerMatchIt != triggermatch.end(); triggerMatchIt++) { + match->addBackVertex(*triggerMatchIt,trigger.keep(), "trigger"); + } + */ + match->addBack(triggermatch); + match->addBack(rightmatch); + // remove elements not kept at begin and end of the expression + match->removeUnkeptAtExtremity(); + + // check if trigger is head + match->setType(currentRule->getType()); + match->setLinguisticProperties(currentRule->getLinguisticProperties()); + match->setContextual(currentRule->contextual()); + setNormalizedForm(currentRule->getNormalizedForm(),*match); + } + + // execute possible actions associated to the rule iff current rule is + // positive + //LDEBUG << "Recognizer: executing actions: "; + bool actionSuccess = true; + if (!currentRule->negative()) { + actionSuccess = currentRule->executeActions(graph, analysis, + constraintCheckList, + success, + match); + //LDEBUG << "actionSuccess=" << actionSuccess; + } + +#ifdef DEBUG_LP + if (logger.isDebugEnabled()) { + LinguisticGraphVertex v=position; + LimaString str(""); + Token* token=get(vertex_token,*(graph.getGraph()),position); + if (token!=0) { + str = token->stringForm(); + } + if (success) { + LDEBUG << "Recognizer::testSetOfRules: trigger " << v << "[" << str << "]:rule " + << currentRule->getRuleId() << "-> success=" << success + << ",actionSuccess=" << actionSuccess; + LDEBUG << " matched:" << 
match->getNormalizedString(Common::MediaticData::MediaticData::single().stringsPool(m_language)); + } + else { + LDEBUG << "Recognizer::testSetOfRules: vertex " << v << "[" << str << "]:rule " + << currentRule->getRuleId() << "-> success= false"; + } + } +#endif + + if (success && actionSuccess) { + if (forbiddenTypes && currentRule->negative()) { + forbiddenTypes->insert(currentRule->getType()); + success = false; + delete match; + match=0; + continue; + } + LINFO << "Recognizer::testSetOfRules: execute rule " << currentRule->getRuleId() + << " of type "<< currentRule->getType() + << "(" << Lima::Common::MediaticData::MediaticData::single().getEntityName(currentRule->getType()) + << ") on vertex " << position; + RecognizerData* recoData = static_cast(analysis.getData("RecognizerData")); + if (stopAtFirstSuccess||(recoData != 0 && !recoData->getNextVertices().empty())) { + matches.push_back(*match); + delete match; // a copy has been made + match=0; +#ifdef DEBUG_LP + if (logger.isDebugEnabled()) { + LDEBUG << "Recognizer::testSetOfRules: Returning from testSetOfRules cause stopAtFirstSuccess (" + << stopAtFirstSuccess << ") or next vertices empty (" + << (recoData->getNextVertices().empty()) + << ")"; + } +#endif + return 1; + } + else { + if (applySameRuleWhileSuccess) { + if (reapplySameRule) { + if (*match==matches.back()) { +// AULOGINIT; +// LDEBUG << "Reapplication of same rule gives same result: " +// << "abort to avoid inifinite loop: " +// << *match << ";" << matches.back(); + delete match; // a copy has been made + match=0; + reapplySameRule=false; + continue; + } +/* else { + LDEBUG << "Reapplication of same rule gives new result"; + }*/ + } + // reapply same rule + #ifdef ANTINNO_BUGFIX + // FWI 19/12/2013 : ajout test pour ne faire le -- que si ncessaire + if (rule != rule_begin) +#endif + rule--; + + reapplySameRule=true; + } + +// LDEBUG << "add match to results " << *match; + matches.push_back(*match); + delete match; // a copy has been made + 
match=0; + + if (onlyOneSuccessPerType) { +/* LDEBUG << "add " << currentRule->getType() + << " in forbiddenTypes";*/ + forbiddenTypes->insert(currentRule->getType()); + } + nbSuccess++; + } + } + else { +// LDEBUG << "-> no success"; + reapplySameRule=false; + } + + if (match !=0) { + delete match; + } + } + + return nbSuccess; +} + +//********************************************************************** +// normalization function +//********************************************************************** +void Recognizer:: +setNormalizedForm(const LimaString& norm, + RecognizerMatch& match) const +{ +#ifdef ANTINNO_SPECIFIC +#ifdef DEBUG_LP + AULOGINIT +#endif +#endif + + match.features().clear(); + + const FsaStringsPool& sp=Common::MediaticData::MediaticData::single().stringsPool(m_language); + if (norm.isEmpty()) { +#ifdef ANTINNO_SPECIFIC +#ifdef DEBUG_LP + LDEBUG << "Recognizer::setNormalizedForm(norm=""): match.getNormalizedString(sp)= " << match.getNormalizedString(sp); +#endif +#endif + // use surface form of the expression as normalized form + match.features().setFeature(DEFAULT_ATTRIBUTE,match.getNormalizedString(sp)); + } + else { +#ifdef ANTINNO_SPECIFIC +#ifdef DEBUG_LP + LDEBUG << "Recognizer::setNormalizedForm(norm): norm= " << norm; +#endif +#endif + match.features().setFeature(DEFAULT_ATTRIBUTE,norm); + } +} + +//********************************************************************** +// main functions that applies the recognizer on a graph +//********************************************************************** + +// Apply between two nodes and search between the same ones +uint64_t Recognizer:: + apply(const LinguisticAnalysisStructure::AnalysisGraph& graph, + const LinguisticGraphVertex& begin, + const LinguisticGraphVertex& end, + AnalysisContent& analysis, + std::vector& result, + bool testAllVertices, + bool stopAtFirstSuccess, + bool onlyOneSuccessPerType, + bool returnAtFirstSuccess, + bool applySameRuleWhileSuccess) const +{ + return 
apply(graph, + begin, + end, + begin, + end, + analysis, + result, + testAllVertices, + stopAtFirstSuccess, + onlyOneSuccessPerType, + returnAtFirstSuccess, + applySameRuleWhileSuccess); +} + +// Apply between two nodes and search between two others. +// precondition [begin, end] included in [upstreamBound,downstreamBound] +uint64_t Recognizer:: + apply(const LinguisticAnalysisStructure::AnalysisGraph& graph, + const LinguisticGraphVertex& begin, + const LinguisticGraphVertex& end, + const LinguisticGraphVertex& upstreamBound, + const LinguisticGraphVertex& downstreamBound, + AnalysisContent& analysis, + std::vector& result, + bool testAllVertices, + bool stopAtFirstSuccess, + bool onlyOneSuccessPerType, + bool returnAtFirstSuccess, + bool applySameRuleWhileSuccess) const +{ +#ifdef ANTINNO_SPECIFIC + auto const& stopAnalyze = analysis.stopAnalyze(); +#endif + if (returnAtFirstSuccess) { + stopAtFirstSuccess=true; // implied by the other + } + +#ifdef DEBUG_LP + AULOGINIT; + LDEBUG << "apply recognizer " << m_filename << " from vertex " + << begin << " to vertex " << end; + LDEBUG << " up bound: " << upstreamBound << "; down bound: " << downstreamBound << "; testAllVertices: " << testAllVertices; + LDEBUG << " stopAtFirstSuccess: " << stopAtFirstSuccess << "; onlyOneSuccessPerType: " << onlyOneSuccessPerType; + LDEBUG << " returnAtFirstSuccess: " << returnAtFirstSuccess << "; applySameRuleWhileSuccess: " << applySameRuleWhileSuccess; +#endif + + uint64_t numberOfRecognized(0); + bool success(false); + + // use deque instead of queue to be able to clear() + std::deque toVisit; + std::set visited; + + toVisit.push_back(begin); + // patch for inifinite loop : avoid begin stopped at first step + //visited.insert(begin); + + bool lastReached = false; + while (!toVisit.empty()) + { + LinguisticGraphVertex currentVertex=toVisit.front(); + toVisit.pop_front(); + // patch for inifinite loop : check if we already seen this node + if (visited.find(currentVertex) != 
visited.end()) + { + continue; + } + + visited.insert(currentVertex); +#ifdef DEBUG_LP + LDEBUG << "to visit size=" << toVisit.size() << " ; currentVertex=" << currentVertex; +#endif + + if (lastReached || // limit given by argument + currentVertex == graph.lastVertex()) { // end of the graph + // LDEBUG << "vertex " << currentVertex << " is last vertex"; + continue; // may be other nodes to test in queue + } + if (currentVertex == end ) { // limit given by argument + lastReached = true; + } + + if (currentVertex != graph.firstVertex()) { +#ifdef DEBUG_LP + LDEBUG << "Recognizer: test on vertex " << currentVertex; +#endif +#ifdef ANTINNO_SPECIFIC + if (stopAnalyze) + { +#if !defined DEBUG_LP + AULOGINIT; +#endif + LERROR << "Stopped in Recognizer"; + return 0; + } +#endif + success = testOnVertex(graph,currentVertex, + upstreamBound,downstreamBound, + analysis,result, + stopAtFirstSuccess, + onlyOneSuccessPerType, + applySameRuleWhileSuccess); + if (success) { + numberOfRecognized++; + if (returnAtFirstSuccess) + return numberOfRecognized; + if (! testAllVertices) { // restart from end of recognized expression +#ifdef DEBUG_LP + LDEBUG << "success: continue from vertex " << currentVertex; +#endif + // GC on 20110803: the clearing below was problematic in case of rules like that: + // []:(t_capital_1st|t_capital){1-3} [,]::LOCATION:N_LOCATION + // which matches text before (left) the trigger which is not included in the match. + // thus the next vertex explored was the newly created one ; the vertex following + // it is already visited (this is in this case the comma) and the content of + // toVisit (the vertex after the trigger) was removed. Thus the search stopped after + // the new vertex. + // Warning: what is the inpact on the use of the testAllVertices parameter ? And is there + // any other side effect ? 
+// toVisit.clear(); + + } + } + } + + // store following nodes to test + LinguisticGraphOutEdgeIt outEdge,outEdge_end; + boost::tie (outEdge,outEdge_end)=out_edges(currentVertex,*(graph.getGraph())); + + for (; outEdge!=outEdge_end; outEdge++) { + LinguisticGraphVertex next=target(*outEdge,*(graph.getGraph())); + if (visited.find(next)==visited.end()) { +#ifdef DEBUG_LP + LDEBUG << "Recognizer: adding out edge target vertex to the 'to visit' list: " << next; +#endif + toVisit.push_back(next); + // do not put in visited unless it is really visited + // (otherwise, may be suppressed when testAllVertices is false + // and never visited) + //visited.insert(next); + } + else { +#ifdef DEBUG_LP + LDEBUG << "Recognizer: already visited:" << next; +#endif + } + } + RecognizerData* recoData=static_cast(analysis.getData("RecognizerData")); + std::set& nextVertices = recoData->getNextVertices(); + if (recoData != 0 && !nextVertices.empty() ) + { +#ifdef DEBUG_LP + LDEBUG << "Recognizer: adding next vertices to the 'to visit' list"; +#endif + std::set< LinguisticGraphVertex >::const_iterator nvit, nvit_end; + nvit = nextVertices.begin(); + nvit_end = nextVertices.end(); + for (; nvit != nvit_end; nvit++) + { +#ifdef DEBUG_LP + LDEBUG << " - " << *nvit; +#endif + toVisit.push_front(*nvit); + } + nextVertices.clear(); + } +#ifdef DEBUG_LP + LDEBUG << "Recognizer: 'to visit' list size is now: " << toVisit.size(); +#endif + } + return numberOfRecognized; +} + + +//********************************************************************** +// test the recognizer on a vertex : test +//********************************************************************** +uint64_t Recognizer:: +testOnVertex(const LinguisticAnalysisStructure::AnalysisGraph& graph, + LinguisticGraphVertex& current, + const LinguisticGraphVertex& begin, + const LinguisticGraphVertex& end, + AnalysisContent& analysis, + std::vector& result, + bool stopAtFirstSuccess, + bool onlyOneSuccessPerType, + bool 
applySameRuleWhileSuccess) const +{ + //AULOGINIT; + Token* token = get(vertex_token, *(graph.getGraph()), current); + MorphoSyntacticData* data = get(vertex_data, *(graph.getGraph()), current); + + if (token==0) { + AULOGINIT; + LERROR << "no token for vertex " << current; + return 0; + } + + if (data==0) { + AULOGINIT; + LERROR << "no data for vertex " << current; + return 0; + } + + vector matchingRules; + set forbiddenTypes; + uint64_t nbSuccess=0; + + findNextSetOfRules(graph, current, analysis, token, data, matchingRules); + + if (! matchingRules.empty()) { + std::vector::const_iterator + ruleSet=matchingRules.begin(), + ruleSet_end=matchingRules.end(); + for (; ruleSet!=ruleSet_end; ruleSet++) { + uint64_t nbSuccessForTheseRules= + testSetOfRules(*((*ruleSet)->transitionUnit()), + (*ruleSet)->setOfRules(), + graph, current, begin, end,analysis, + result, &forbiddenTypes, + stopAtFirstSuccess, + onlyOneSuccessPerType, + applySameRuleWhileSuccess); + if (nbSuccessForTheseRules>0) { + nbSuccess+=nbSuccessForTheseRules; + // skip recognized part (if the end of the recognized part is after + // current token) + RecognizerMatch& lastSuccess=result.back(); + Token* t=get(vertex_token,*(graph.getGraph()),current); + uint64_t currentTokenEnd=t->position()+t->length(); + RecognizerData* recoData = static_cast(analysis.getData("RecognizerData")); + if (stopAtFirstSuccess||(recoData != 0 && !recoData->getNextVertices().empty())) { + if (lastSuccess.positionEnd() >= currentTokenEnd) { + current=lastSuccess.getEnd(); + } + break; + } + } + } + for(std::vector::iterator it=matchingRules.begin(), + it_end=matchingRules.end(); it!=it_end; it++) { + if (*it!=0) { + delete (*it); + } + } + } + forbiddenTypes.clear(); + + // LDEBUG << "testOnVertex nb successes: " << nbSuccess; + return nbSuccess; +} + +//********************************************************************** +//resolve the problem of overlapping entities in the list of entities : +// when two entities are 
overlaping, only one is kept +//********************************************************************** +uint64_t Recognizer:: +resolveOverlappingEntities(std::vector& listEntities, + const OverlapResolutionStrategy& strategy) const +{ + typedef std::vector::iterator vectorRecognizerMatchIterator; + + uint64_t numberOfOverlappingEntities(0); + + if (listEntities.empty()) { + return numberOfOverlappingEntities; + } + + switch (strategy) { + case IGNORE_FIRST: { + vectorRecognizerMatchIterator currentEntity(listEntities.begin()); + vectorRecognizerMatchIterator nextEntity(currentEntity); + nextEntity++; + while (nextEntity != listEntities.end()) { + if (currentEntity->isOverlapping(*nextEntity)) { + numberOfOverlappingEntities++; + currentEntity=listEntities.erase(currentEntity); + nextEntity=currentEntity; + nextEntity++; + } + else { + currentEntity++; + nextEntity++; + } + } + break; + } + case IGNORE_SECOND: { + vectorRecognizerMatchIterator currentEntity(listEntities.begin()); + vectorRecognizerMatchIterator previousEntity(currentEntity); + currentEntity++; + while (currentEntity != listEntities.end()) { + if (currentEntity->isOverlapping(*previousEntity)) { + numberOfOverlappingEntities++; + currentEntity=listEntities.erase(currentEntity); + } + else { + previousEntity++; + currentEntity++; + } + } + break; + } + case IGNORE_SMALLEST: { + vectorRecognizerMatchIterator currentEntity(listEntities.begin()); + vectorRecognizerMatchIterator previousEntity(currentEntity); + currentEntity++; + while (currentEntity != listEntities.end()) { + if (currentEntity->isOverlapping(*previousEntity)) { + numberOfOverlappingEntities++; + if (currentEntity->numberOfElements() + < previousEntity->numberOfElements()) { // keep previous entity + currentEntity=listEntities.erase(currentEntity); + } + else { // keep current entity + previousEntity=listEntities.erase(previousEntity); + currentEntity=previousEntity; + currentEntity++; + } + } + else { + previousEntity++; + 
currentEntity++; + } + } + break; + } + default: + break; + } + + return numberOfOverlappingEntities; +} + +//********************************************************************** +// find the set of rules in the recognizer that accept +// a particular token as trigger +//********************************************************************** +void Recognizer:: +findNextSetOfRules(const LinguisticAnalysisStructure::AnalysisGraph& graph, + LinguisticGraphVertex& vertex, + AnalysisContent& analysis, + const LinguisticAnalysisStructure::Token* token, + const LinguisticAnalysisStructure::MorphoSyntacticData* data, + std::vector& matchingSetOfRules) const +{ + matchingSetOfRules.clear(); + + // find matching rules + std::vector matchingRules; + m_searchStructure.findMatchingTransitions(graph,vertex,analysis,token,data,matchingRules); + + // matching rules are gathered by common trigger (transition unit) + // we have to re-sort the rules by their weight at a global level, independently of the trigger + // create a vector of TriggerRule where each contains only one rule, then sort it + for (std::vector::const_iterator it=matchingRules.begin(),it_end=matchingRules.end();it!=it_end;it++) { + for (SetOfRules::const_iterator r=(*it)->setOfRules().begin(),r_end=(*it)->setOfRules().end(); r!=r_end;r++) { + matchingSetOfRules.push_back(new TriggerRule((*it)->transitionUnit(),SetOfRules(1,*r))); + } + } + sort(matchingSetOfRules.begin(),matchingSetOfRules.end(),CompareTriggerRule()); + + // then, gather rules with the same trigger that are consecutive in this new list + // (may save some constraint checking on trigger) + if (! 
matchingSetOfRules.empty()) { + std::vector::iterator it=matchingSetOfRules.begin(); + TransitionUnit* currentTrigger=(*it)->transitionUnit(); + std::vector::iterator next=it; + next++; + while (next!=matchingSetOfRules.end()) { + if ((*next)->transitionUnit() == currentTrigger) { + (*it)->second.push_back((*next)->setOfRules().front()); + delete *next; + next=matchingSetOfRules.erase(next); + } + else { + it++; + currentTrigger=(*it)->transitionUnit(); + next++; + } + } + } +} + +void Recognizer::initializeSearchStructure() { + const Common::PropertyCode::PropertyAccessor* macro=&(static_cast(Common::MediaticData::MediaticData::single().mediaData(m_language)).getPropertyCodeManager().getPropertyAccessor("MACRO")); + const Common::PropertyCode::PropertyAccessor* micro=&(static_cast(Common::MediaticData::MediaticData::single().mediaData(m_language)).getPropertyCodeManager().getPropertyAccessor("MICRO")); + m_searchStructure.init(m_rules,macro,micro); +} + +void Recognizer::clearSearchStructure() { + m_searchStructure.clear(); +} + +//********************************************************************** +// adding a rule +//********************************************************************** +uint64_t Recognizer::addRuleInStorage(Rule* rule) +{ + // add the rule in the storage + m_ruleStorage.push_back(rule); + // return the index of the rule in the storage + return (m_ruleStorage.size() - 1); +} + +uint64_t Recognizer::addRule(TransitionUnit* trigger, Rule* rule) +{ + uint64_t indexRule=addRuleInStorage(rule); + + // find if the trigger already exists in the set of triggers + for (uint64_t i(0); iclone(), + SetOfRules(1,rule))); + + return indexRule; +} + +void Recognizer::addRule(TransitionUnit* trigger, + const uint64_t index) +{ + // find if the trigger already exists in the set of triggers + for (uint64_t i(0); iclone(), + SetOfRules(1,m_ruleStorage[index]))); +} + +//********************************************************************** +// input/output in a 
binary format +//********************************************************************** +// void Recognizer::readFromTextFile(std::string filename) { +// RecognizerCompiler::buildRecognizer(*this,filename); +// } + +// simple linear search (called only with write function -> not optimized) +uint64_t Recognizer::findRuleIndex(Rule* r) const +{ + for (uint64_t i(0); i" << m_rules[i].first->printValue() << "" + << "" << i << "" << endl; + } +} + +//*************************************************************************** +// output +//*************************************************************************** +ostream& operator << (ostream& os, const Recognizer& r) +{ + for (uint64_t i(0); i(name,value); } + void addVertexAsEmbededEntity(const LinguisticGraphVertex& vertex) + { + m_embededEntities.insert(vertex); + } + bool hasVertexAsEmbededEntity(const LinguisticGraphVertex& vertex) const + { + return (m_embededEntities.find(vertex) != m_embededEntities.end()); + } void clearEntityFeatures(); Automaton::EntityFeatures& getEntityFeatures() { return m_entityFeatures; } @@ -155,6 +163,8 @@ class LIMA_AUTOMATON_EXPORT RecognizerData : public AnalysisData // EntityFeatures : for functions to add features Automaton::EntityFeatures m_entityFeatures; + // embededEntities : set of embeded entities + std::set< LinguisticGraphVertex > m_embededEntities; }; } // end namespace diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/recognizerMatch.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/recognizerMatch.cpp index d3f227f8e..b064aa7c5 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/recognizerMatch.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/recognizerMatch.cpp @@ -211,6 +211,13 @@ LimaString RecognizerMatch::getNormalizedString(const FsaStringsPool& sp) const v != m_graph->lastVertex()) { if ((*i).isKept()) { Token* t = get(vertex_token,*(m_graph->getGraph()),v); 
+#ifdef ANTINNO_SPECIFIC +#ifdef DEBUG_LP + LOGINIT("LP::Automaton"); + LDEBUG << "RecognizerMatch::getNormalizedString(...) token.form(): " << t->form(); + LDEBUG << "RecognizerMatch::getNormalizedString(...) token.stringForm(): " << t->stringForm(); +#endif +#endif if (t->status().isAlphaHyphen()) { firstHyphenPassed = true; @@ -222,6 +229,12 @@ LimaString RecognizerMatch::getNormalizedString(const FsaStringsPool& sp) const } else { // take first norm +#ifdef ANTINNO_SPECIFIC +#ifdef DEBUG_LP + LOGINIT("LP::Automaton"); + LDEBUG << "RecognizerMatch::getNormalizedString(...) data->front().normalizedForm: " << data->front().normalizedForm; +#endif +#endif str += sp[data->front().normalizedForm]; } currentPosition=t->position()+t->length(); @@ -291,6 +304,9 @@ isOverlapping(const RecognizerMatch& otherMatch) const { //********************************************************************** void RecognizerMatch::addBackVertex(const LinguisticGraphVertex& v, bool isKept, const LimaString& ruleElementId ) { + AULOGINIT; + LDEBUG << "RecognizerMatch:addBackVertex(v:" << v << ", isKept:" << isKept << ", ruleElmtId:" << ruleElementId << ")"; + push_back(MatchElement(v,isKept, ruleElementId)); } @@ -303,6 +319,8 @@ void RecognizerMatch::popBackVertex() { void RecognizerMatch::addFrontVertex(const LinguisticGraphVertex& v, bool isKept, const LimaString& ruleElementId) { + AULOGINIT; + LDEBUG << "RecognizerMatch:addFrontVertex(v:" << v << ", isKept:" << isKept << ", ruleElmtId:" << ruleElementId << ")"; insert(begin(),MatchElement(v,isKept,ruleElementId)); } @@ -314,10 +332,16 @@ void RecognizerMatch::popFrontVertex() { } void RecognizerMatch::addBack(const RecognizerMatch& l) { + if( l.getHead() != 0 ){ + setHead(l.getHead()); + } insert(end(),l.begin(),l.end()); } void RecognizerMatch::addFront(const RecognizerMatch& l) { + if( l.getHead() != 0 ){ + setHead(l.getHead()); + } insert(begin(),l.begin(),l.end()); } diff --git 
a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/rule.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/rule.cpp index 9ebb67b53..d887ad8c7 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/rule.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/rule.cpp @@ -302,7 +302,7 @@ bool Rule::executeActions(const LinguisticAnalysisStructure::AnalysisGraph& grap LDEBUG << "Rule::executeActions: check vertex " << matchElmt->m_elem.first << " with " << matchElmt->getRuleElemtId(); #endif - if( matchElmt->getRuleElemtId() == ruelElemtId ) { + if( (matchElmt->getRuleElemtId()).startsWith(ruelElemtId) ) { #ifdef DEBUG_LP LDEBUG << "Rule::executeActions: found " << matchElmt->m_elem.first; #endif diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/transitionSearchStructure.h b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/transitionSearchStructure.h index b400f3f54..b6b72141f 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/transitionSearchStructure.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/transitionSearchStructure.h @@ -37,6 +37,9 @@ #include "transitionUnit.h" #include "automatonCommon.h" #include +#ifdef ANTINNO_SPECIFIC +#include "searchGraph.h" +#endif namespace Lima { namespace LinguisticProcessing { @@ -67,6 +70,15 @@ class TransitionSearchStructure const LinguisticAnalysisStructure::Token* token, const LinguisticAnalysisStructure::MorphoSyntacticData* data, std::vector& matchingSetOfRules) const; + uint64_t + findMatchingTransitions2(const LinguisticAnalysisStructure::AnalysisGraph& graph, + const LinguisticGraphVertex& vertex, + const LinguisticGraphVertex& limit, + SearchGraph* searchGraph, + AnalysisContent& analysis, + const LinguisticAnalysisStructure::Token* token, + const LinguisticAnalysisStructure::MorphoSyntacticData* data, + std::vector,const TargetType*> >& 
matchingSetOfRules) const; // for debug only void printStructure(std::ostream& os) const; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/transitionUnit.h b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/transitionUnit.h index f52023641..7c11cc1ce 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/transitionUnit.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/transitionUnit.h @@ -56,7 +56,9 @@ typedef enum { T_AND, T_SET, T_DEACCENTUATED, - T_ENTITY + T_ENTITY, + T_ENTITY_GROUP, + T_GAZETEER } TypeTransition; // useful for the read/write functions class LIMA_AUTOMATON_EXPORT TransitionUnit diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/CoreLinguisticProcessingClient.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/CoreLinguisticProcessingClient.cpp index c344bc0a9..7a729cd9c 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/CoreLinguisticProcessingClient.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/CoreLinguisticProcessingClient.cpp @@ -33,6 +33,7 @@ #include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" #include "common/Data/strwstrtools.h" #include "common/time/timeUtilsController.h" +#include "common/tools/FileUtils.h" #include "linguisticProcessing/LinguisticProcessingCommon.h" #include "linguisticProcessing/client/LinguisticProcessingClientFactory.h" #include "common/MediaProcessors/MediaProcessors.h" @@ -42,6 +43,7 @@ #include "linguisticProcessing/core/LinguisticResources/LinguisticResources.h" #include +#include uint64_t t1; @@ -58,15 +60,13 @@ namespace Lima namespace LinguisticProcessing { -CoreLinguisticProcessingClientFactory* CoreLinguisticProcessingClientFactory::s_instance=new CoreLinguisticProcessingClientFactory(); +std::unique_ptr CoreLinguisticProcessingClientFactory::s_instance=std::unique_ptr(new CoreLinguisticProcessingClientFactory()); 
CoreLinguisticProcessingClient::CoreLinguisticProcessingClient() {} CoreLinguisticProcessingClient::~CoreLinguisticProcessingClient() { - delete LinguisticResources::pchangeable(); - delete MediaProcessors::pchangeable(); } void CoreLinguisticProcessingClient::analyze( @@ -74,12 +74,19 @@ void CoreLinguisticProcessingClient::analyze( const std::map& metaData, const std::string& pipelineId, const std::map& handlers, +#ifdef ANTINNO_SPECIFIC + const std::set& inactiveUnits, StopAnalyze const& stopAnalyze) const +#else const std::set& inactiveUnits) const - +#endif { LimaString limatexte=Common::Misc::utf8stdstring2limastring(texte); - +#ifdef ANTINNO_SPECIFIC + analyze(limatexte,metaData,pipelineId,handlers,inactiveUnits, stopAnalyze); +#else analyze(limatexte,metaData,pipelineId,handlers,inactiveUnits); +#endif + } void CoreLinguisticProcessingClient::analyze( @@ -87,13 +94,21 @@ void CoreLinguisticProcessingClient::analyze( const std::map& metaData, const std::string& pipelineId, const std::map& handlers, +#ifdef ANTINNO_SPECIFIC + const std::set& inactiveUnits, StopAnalyze const& stopAnalyze) const +#else const std::set& inactiveUnits) const +#endif { Lima::TimeUtilsController timer("CoreLinguisticProcessingClient::analyze"); CORECLIENTLOGINIT; // create analysis content +#ifdef ANTINNO_SPECIFIC + AnalysisContent analysis(stopAnalyze); +#else AnalysisContent analysis; +#endif LinguisticMetaData* metadataholder=new LinguisticMetaData(); // will be destroyed in AnalysisContent destructor analysis.setData("LinguisticMetaData",metadataholder); @@ -258,7 +273,6 @@ void CoreLinguisticProcessingClientFactory::configure( } } - string configPath=Common::MediaticData::MediaticData::single().getConfigPath(); for (deque::const_iterator langItr=langToload.begin(); langItr!=langToload.end(); langItr++) @@ -268,17 +282,30 @@ void CoreLinguisticProcessingClientFactory::configure( string file; try { - file=configPath + "/" + configuration.getModuleGroupParamValue( + QStringList 
configPaths = QString::fromUtf8(Common::MediaticData::MediaticData::single().getConfigPath().c_str()).split(LIMA_PATH_SEPARATOR); + Q_FOREACH(QString confPath, configPaths) + { + QString mediaProcessingDefinitionFile = QString::fromUtf8(configuration.getModuleGroupParamValue( "lima-coreclient", "mediaProcessingDefinitionFiles", - *langItr); + *langItr).c_str()); + if (QFileInfo(confPath + "/" + mediaProcessingDefinitionFile).exists()) + { + file= (confPath + "/" + mediaProcessingDefinitionFile).toUtf8().constData(); + break; + } + } } catch (NoSuchParam& ) { LERROR << "no language definition file for language " << *langItr; throw InvalidConfiguration("no language definition file for language "); } - + if (file.empty()) + { + LERROR << "no language definition file for language " << *langItr; + throw InvalidConfiguration("no language definition file for language "); + } XMLConfigurationFileParser langParser(file); //initialize SpecificEntities @@ -331,9 +358,9 @@ void CoreLinguisticProcessingClientFactory::configure( } } -AbstractLinguisticProcessingClient* CoreLinguisticProcessingClientFactory::createClient() const +std::shared_ptr< AbstractProcessingClient > CoreLinguisticProcessingClientFactory::createClient() const { - return new CoreLinguisticProcessingClient(); + return std::shared_ptr< AbstractProcessingClient >(new CoreLinguisticProcessingClient()); } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/CoreLinguisticProcessingClient.h b/lima_linguisticprocessing/src/linguisticProcessing/core/CoreLinguisticProcessingClient.h index 3e1ee072c..9770305ad 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/CoreLinguisticProcessingClient.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/CoreLinguisticProcessingClient.h @@ -49,15 +49,23 @@ class LIMA_CORELINGUISTICPROCESSINGCLIENT_EXPORT CoreLinguisticProcessingClient const std::map& metaData, const std::string& pipeline, const std::map& handlers, - const std::set& 
inactiveUnits = std::set()) const - ; +#ifdef ANTINNO_SPECIFIC + const std::set& inactiveUnits = std::set(), Lima::StopAnalyze const& stopAnalyze = Lima::defaultStopAnalyze) const +#else + const std::set& inactiveUnits = std::set()) const +#endif +; void analyze(const std::string& texte, const std::map& metaData, const std::string& pipeline, const std::map& handlers, +#ifdef ANTINNO_SPECIFIC + const std::set& inactiveUnits = std::set(), Lima::StopAnalyze const& stopAnalyze = Lima::defaultStopAnalyze) const +#else const std::set& inactiveUnits = std::set()) const - ; +#endif +; }; class CoreLinguisticProcessingClientFactory : public AbstractLinguisticProcessingClientFactory @@ -70,13 +78,13 @@ class CoreLinguisticProcessingClientFactory : public AbstractLinguisticProcessin std::deque langs, std::deque pipelines); - AbstractLinguisticProcessingClient* createClient() const; + std::shared_ptr< AbstractProcessingClient > createClient() const; virtual ~CoreLinguisticProcessingClientFactory(); private: CoreLinguisticProcessingClientFactory(); - static CoreLinguisticProcessingClientFactory* s_instance; + static std::unique_ptr s_instance; }; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/CorefSolving/corefSolver.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/CorefSolving/corefSolver.cpp index 077ef398c..6ff6e112e 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/CorefSolving/corefSolver.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/CorefSolving/corefSolver.cpp @@ -292,7 +292,7 @@ LimaStatusCode CorefSolver::process( * function */ if (annotationData->dumpFunction("Coreferent") == 0) { - annotationData->dumpFunction("Coreferent", new DumpCoreferent()); + annotationData->dumpFunction("Coreferent", new DumpCoreferent(annotationData)); } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/CorefSolving/coreferentAnnotation.cpp 
b/lima_linguisticprocessing/src/linguisticProcessing/core/CorefSolving/coreferentAnnotation.cpp index 7ca3bb1e4..35b720d4b 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/CorefSolving/coreferentAnnotation.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/CorefSolving/coreferentAnnotation.cpp @@ -55,7 +55,7 @@ int DumpCoreferent::dump(std::ostream& os, Common::AnnotationGraphs::GenericAnno PROCESSORSLOGINIT; try { - ga.value().dump(os); + ga.value().dump(os, m_ad); return SUCCESS_ID; } catch (const boost::bad_any_cast& ) @@ -1111,8 +1111,8 @@ bool CoreferentAnnotation::aba5( // COREFSOLVERLOGINIT; // LDEBUG << "aba5"; LinguisticGraph* graph = anagraph->getGraph(); - LinguisticCode L_NC = (*tagLocalDef.find("NomCommunMacroCategory")).second; - LinguisticCode L_NP = (*tagLocalDef.find("NomPropreMacroCategory")).second; + LinguisticCode NC = (*tagLocalDef.find("NomCommunMacroCategory")).second; + LinguisticCode NP = (*tagLocalDef.find("NomPropreMacroCategory")).second; bool res = false; DependencyGraphVertex* qv = new DependencyGraphVertex(); if (ca.isDeterminer(qv,sd, relLocalDef, language, anagraph, ac)) @@ -1120,7 +1120,7 @@ bool CoreferentAnnotation::aba5( MorphoSyntacticData* data = get(vertex_data,*graph,sd->tokenVertexForDepVertex(*qv)); if (data ==0 || data->empty()) { return false; }; // if *qv is a noun - if (data->firstValue(*macroAccessor) == L_NC || data->firstValue(*macroAccessor) == L_NP) + if (data->firstValue(*macroAccessor) == NC || data->firstValue(*macroAccessor) == NP) { // if Q is in the argument domain of N, CoreferentAnnotation caQ(0,*qv); @@ -1307,6 +1307,49 @@ AnnotationGraphVertex CoreferentAnnotation::writeAnnotation( return AnnotationGraphVertex(); //unused; } +DumpCoreferent::DumpCoreferent(const Lima::Common::AnnotationGraphs::AnnotationData* ad) : + Common::AnnotationGraphs::AnnotationData::Dumper(), + m_ad(ad) +{ +} + +void CoreferentAnnotation::dump(std::ostream& os, const 
Common::AnnotationGraphs::AnnotationData* ad) const +{ + os << "#" << m_id << ";" << m_categ<< ";" /*<< "V:" << m_morphVertex */; + CoreferentAnnotation antecedent; + bool hasAntecedent = false; + std::set< AnnotationGraphVertex > matches = ad->matches("PosGraph",m_morphVertex,"annot"); + if (matches.empty()) + { + COREFSOLVERLOGINIT; + LERROR << "CoreferentAnnotation::dump No annotation graph vertex matches PoS graph vertex " << m_morphVertex << ". This should not happen."; + return ; + } + AnnotationGraphVertex av = *matches.begin(); + AnnotationGraphOutEdgeIt it, it_end; + boost::tie(it, it_end) = boost::out_edges(av, ad->getGraph()); + if (it != it_end) + { + for (; it != it_end; it++) + { + GenericAnnotation ga = ad->annotation(boost::target(*it, ad->getGraph()), utf8stdstring2limastring("Coreferent")); + try + { + antecedent = ga.value(); + hasAntecedent = true; + break; + } + catch (const boost::bad_any_cast& ) + { + continue; + } + } + } + if (hasAntecedent) + { + os << "#" << antecedent.id(); + } +} void CoreferentAnnotation::outputXml(std::ostream& xmlStream,const LinguisticGraph& g, const AnnotationData* ad) const { diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/CorefSolving/coreferentAnnotation.h b/lima_linguisticprocessing/src/linguisticProcessing/core/CorefSolving/coreferentAnnotation.h index 238970472..7134fdabc 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/CorefSolving/coreferentAnnotation.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/CorefSolving/coreferentAnnotation.h @@ -113,7 +113,7 @@ typedef std::map< CoreferentAnnotation*,std::map > inline void morphVertex(LinguisticGraphVertex v); inline void av(AnnotationGraphVertex av); inline void newerRef(CoreferentAnnotation* newerRef); - inline void dump(std::ostream& os); + void dump( std::ostream& os, const Lima::Common::AnnotationGraphs::AnnotationData* ad ) const; inline bool hasNewerRef(/*std::deque* npCandidates*/); /** general test 
functions */ @@ -514,10 +514,6 @@ inline void CoreferentAnnotation::newerRef(CoreferentAnnotation* newerRef) { m_newerRef = newerRef; } -inline void CoreferentAnnotation::dump(std::ostream& os) -{ - os << "#" << m_id << ";" << m_categ<< ";" << /*"V:" << m_morphVertex <<*/ "\n"; -} inline bool CoreferentAnnotation::hasNewerRef() { return (newerRef()!=this); @@ -537,8 +533,13 @@ return (newerRef()!=this); */ class DumpCoreferent : public Common::AnnotationGraphs::AnnotationData::Dumper { - public: - virtual int dump(std::ostream& os, Common::AnnotationGraphs::GenericAnnotation& ga) const; +public: + DumpCoreferent(const Lima::Common::AnnotationGraphs::AnnotationData* ad); + virtual int dump(std::ostream& os, Common::AnnotationGraphs::GenericAnnotation& ga) const; + +private: + const Lima::Common::AnnotationGraphs::AnnotationData* m_ad; + }; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/Dictionary/DictionaryCode.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/Dictionary/DictionaryCode.cpp index 2431495f6..4c569590f 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/Dictionary/DictionaryCode.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/Dictionary/DictionaryCode.cpp @@ -30,6 +30,7 @@ #include "common/linguisticData/languageData.h" #include "common/XMLConfigurationFiles/xmlConfigurationFileParser.h" #include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" +#include "common/misc/FileUtils.h" #include "common/misc/strwstrtools.h" // #include "linguisticProcessing/core/Tokenizer/Exceptions.h" @@ -96,20 +97,22 @@ void DictionaryCode::init( #endif m_language=manager->getInitializationParameters().language; std::string resourcesPath=Common::LinguisticData::LinguisticData::single().getResourcesPath(); - std::string codesListFileName; - std::string codeFileName; - try - { - codesListFileName=resourcesPath+"/"+unitConfiguration.getParamsValueAtKey("codeListFile"); - } - catch (NoSuchParam& 
) - { - LERROR << "no param 'codeListFile' in DictionaryCode group for language " << (int) m_language; - throw InvalidConfiguration(); - } +// QString codesListFileName; +// try +// { +// codesListFileName = Common::Misc::findFileInPaths(resourcesPath.c_str(), unitConfiguration.getParamsValueAtKey("codeListFile").c_str()); +// } +// catch (NoSuchParam& ) +// { +// LERROR << "no param 'codeListFile' in DictionaryCode group for language " << (int) m_language; +// throw InvalidConfiguration(); +// } +// loadCodesMaps(codesListFileName); + + QString codeFileName; try { - codeFileName=resourcesPath+"/"+unitConfiguration.getParamsValueAtKey("codeFile"); + codeFileName = Common::Misc::findFileInPaths(resourcesPath.c_str(), unitConfiguration.getParamsValueAtKey("codeFile").c_str()); } catch (NoSuchParam& ) { @@ -117,8 +120,7 @@ void DictionaryCode::init( throw InvalidConfiguration(); } -// loadCodesMaps(codesListFileName); - parse(codeFileName); + parse(codeFileName.toUtf8().constData()); } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplate.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplate.cpp index 9289b5656..51f21430b 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplate.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplate.cpp @@ -1,21 +1,3 @@ -/* - Copyright 2002-2013 CEA LIST - - This file is part of LIMA. - - LIMA is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - LIMA is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. 
- - You should have received a copy of the GNU Affero General Public License - along with LIMA. If not, see -*/ /************************************************************************ * * @file EventTemplate.cpp @@ -38,6 +20,13 @@ m_mainEvent(false) { } +EventTemplate::EventTemplate(const std::string type): +m_template(), +m_weight(0.0), +m_type(type), +m_mainEvent(false) +{ +} EventTemplate::~EventTemplate() { } @@ -61,7 +50,7 @@ const EventTemplateElement& EventTemplate::getElement(const std::string& role) c it=m_template.find(role); if (it==m_template.end()) { LOGINIT("LP::EventAnalysis"); - LERROR << "No element '" << role << "' in EventTemplate"; + LERROR << "No element '" << role << "' in EventTemplate" << LENDL; return emptyElement; } return (*it).second; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplate.h b/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplate.h index 070315a14..070f6c11e 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplate.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplate.h @@ -1,21 +1,3 @@ -/* - Copyright 2002-2013 CEA LIST - - This file is part of LIMA. - - LIMA is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - LIMA is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with LIMA. 
If not, see -*/ /************************************************************************ * * @file EventTemplate.h @@ -46,11 +28,13 @@ typedef std::string EventRole; class LIMA_EVENTANALISYS_EXPORT EventTemplate { public: - EventTemplate(); + EventTemplate(); + EventTemplate(const std::string); ~EventTemplate(); void addElement(const std::string& role, const EventTemplateElement& elt); void setWeight(double w) { m_weight=w; } + void setType(const std::string type) { m_type=type; } void setMain(bool isMainEvent) { m_mainEvent=isMainEvent; } void clear(); @@ -58,12 +42,14 @@ class LIMA_EVENTANALISYS_EXPORT EventTemplate const std::map& getTemplateElements() const { return m_template; } std::map& getTemplateElements() { return m_template; } double getWeight() const { return m_weight; } + const std::string getType() const { return m_type; } bool isMainEvent() const { return m_mainEvent; } private: std::map m_template; double m_weight; bool m_mainEvent; + std::string m_type; }; } // end namespace diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateData.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateData.cpp index b5a5ccfc7..06961be37 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateData.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateData.cpp @@ -67,6 +67,14 @@ void EventTemplateData::clearCurrentTemplate() back().clear(); } +void EventTemplateData::setTypeInCurrentTemplate(const std::string& type) +{ + LOGINIT("LP::EventAnalysis"); + LDEBUG << "set Current Template Type " << type << LENDL; + back().setType(type); + LDEBUG << "bak.getType " << back().getType() << LENDL; +} + //------------------------------------------------------------------------------- // conversion to Events (for compatibility with EventExtraction web service) Events* EventTemplateData:: diff --git 
a/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateData.h b/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateData.h index d9a29adad..7469eb71a 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateData.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateData.h @@ -51,6 +51,7 @@ class LIMA_EVENTANALISYS_EXPORT EventTemplateData : public AnalysisData, public void addTemplate(); void addElementInCurrentTemplate(const std::string& role, const EventTemplateElement& elt); void clearCurrentTemplate(); + void setTypeInCurrentTemplate(const std::string&); Events* convertToEvents(const Common::AnnotationGraphs::AnnotationData* annotationData) const; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateDataXmlLogger.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateDataXmlLogger.cpp index baeb1937c..1cc6cfeeb 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateDataXmlLogger.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateDataXmlLogger.cpp @@ -135,7 +135,9 @@ void EventTemplateDataXmlLogger::outputEventData(std::ostream& out, i++; out << " " << endl; + << " main=\"" << (*it).isMainEvent() << "\"" + << " type=\"" << (*it).getType() << "\">" + << endl; int j=0; out << " " << endl; for(map::const_iterator it1= templateElements.begin(); it1!= templateElements.end();it1++) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateDefinitionResource.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateDefinitionResource.cpp index a50d9c1ae..c1f059ae8 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateDefinitionResource.cpp +++ 
b/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateDefinitionResource.cpp @@ -1,21 +1,3 @@ -/* - Copyright 2002-2013 CEA LIST - - This file is part of LIMA. - - LIMA is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - LIMA is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with LIMA. If not, see -*/ /************************************************************************ * * @file EventTemplateDefinitionResource.cpp @@ -44,12 +26,49 @@ EventTemplateDefinitionResourceFactory(EVENTTEMPLATEDEFINITIONRESOURCE_CLASSID); //---------------------------------------------------------------------- EventTemplateDefinitionResource::EventTemplateDefinitionResource(): -m_language(0) +m_language(0), +m_templates(), +m_elementMapping() { } EventTemplateDefinitionResource::~EventTemplateDefinitionResource() { } +const std::string& EventTemplateDefinitionResource::getMention (const std::string name) const +{ +#ifdef ANTINNO_SPECIFIC + // pour viter erreur c4172 + static std::string const mention=""; +#else + std::string mention=""; +#endif + LOGINIT("LP::EventAnalysis"); + LDEBUG << "getMention m_templates.size() " << m_templates.size(); + for(std::vector::const_iterator it=m_templates.begin();it!=m_templates.end();it++) + { + LDEBUG << "Cuurent Mention " << it->getMention()<< LENDL; + if (name.compare(it->getName())==0) return it->getMention(); + } + return mention; +} + +const std::map& EventTemplateDefinitionResource::getStructure (const std::string name) const +{ +#ifdef ANTINNO_SPECIFIC + // 
pour viter erreur c4172 + static std::map const structure; +#else + std::map structure; +#endif + LOGINIT("LP::EventAnalysis"); + LDEBUG << "getMention m_templates.size() " << m_templates.size(); + for(std::vector::const_iterator it=m_templates.begin();it!=m_templates.end();it++) + { + //LDEBUG << "Cuurent Mention " << it->getMention()<< LENDL; + if (name.compare(it->getName())==0) return it->getStructure(); + } + return structure; +} //---------------------------------------------------------------------- void EventTemplateDefinitionResource:: @@ -60,25 +79,38 @@ init(GroupConfigurationStructure& unitConfiguration, LOGINIT("LP::EventAnalysis"); m_language=manager->getInitializationParameters().language; - string resourcesPath=Common::MediaticData::MediaticData::single().getResourcesPath(); - EventTemplateStructure structure; // get name try { string name = unitConfiguration.getParamsValueAtKey("templateName"); structure.setName(name); + LDEBUG << "Template name = "<< name; + } catch (NoSuchParam& ) { LERROR << "No param 'templateName' in EventTemplateDefinitionResource for language " << (int)m_language; throw InvalidConfiguration(); } + try{ + + string nameMention = unitConfiguration.getParamsValueAtKey("templateMention"); + LDEBUG << "Template mention = "<< nameMention; + structure.setMention(nameMention); + } + + catch (NoSuchParam& ) { + LERROR << "No param 'templateMention' in EventTemplateDefinitionResource for language " << (int)m_language; + //throw InvalidConfiguration(); + } // get template elements: role and entity types try { map elts = unitConfiguration.getMapAtKey("templateElements"); + LDEBUG << "templateElements .size " << elts.size(); for(map::const_iterator it=elts.begin(),it_end=elts.end();it!=it_end;it++) { + LDEBUG << "templateElement =" << (*it).first; structure.addTemplateElement((*it).first,(*it).second); } } @@ -88,9 +120,11 @@ init(GroupConfigurationStructure& unitConfiguration, } // get element mapping, for template merging + LDEBUG << 
"get elementMapping "; try { map mapping = unitConfiguration.getMapAtKey("elementMapping"); + LDEBUG << "after Getting map "; for(map::const_iterator it=mapping.begin(),it_end=mapping.end();it!=it_end;it++) { const std::string& elements=(*it).second; // comma-separated list of elements @@ -102,10 +136,10 @@ init(GroupConfigurationStructure& unitConfiguration, } } } - catch (NoSuchParam& ) { + catch (NoSuchMap& ) { LDEBUG << "No param 'elementMapping' in EventTemplateDefinition for language " << (int)m_language; } - + LDEBUG << "Adding Structure "; m_templates.push_back(structure); } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateDefinitionResource.h b/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateDefinitionResource.h index 7e1dd2234..72f9f5d1e 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateDefinitionResource.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateDefinitionResource.h @@ -1,21 +1,3 @@ -/* - Copyright 2002-2013 CEA LIST - - This file is part of LIMA. - - LIMA is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - LIMA is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with LIMA. 
If not, see -*/ /************************************************************************ * * @file EventTemplateDefinitionResource.h @@ -54,6 +36,8 @@ class LIMA_EVENTANALISYS_EXPORT EventTemplateDefinitionResource : public Abstrac // mapping is oriented, return 1 if mapping elt1 -> elt2, -1 if mapping elt2 -> elt1, 0 otherwise int existsMapping(const std::string& eltName1, const std::string& eltName2) const; + const std::string& getMention(const std::string) const; + const std::map& getStructure(const std::string) const; private: MediaId m_language; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateFillingActions.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateFillingActions.cpp index f67ce99c8..f1d816e95 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateFillingActions.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateFillingActions.cpp @@ -135,8 +135,13 @@ bool AddTemplateElement::operator()(const LinguisticAnalysisStructure::AnalysisG //---------------------------------------------------------------------- CreateEventTemplate::CreateEventTemplate(MediaId language, const LimaString& complement): -Automaton::ConstraintFunction(language,complement) +Automaton::ConstraintFunction(language,complement), +m_eventType() { + LOGINIT("LP::EventAnalysis"); + LDEBUG << "Complement " << complement << LENDL; + m_eventType=Common::Misc::limastring2utf8stdstring(complement); + LDEBUG << "m_event_type " << m_eventType << LENDL; } bool CreateEventTemplate::operator()(AnalysisContent& analysis) const @@ -151,6 +156,9 @@ bool CreateEventTemplate::operator()(AnalysisContent& analysis) const LDEBUG << "CreateEventTemplate"; // validate current template by creating a new empty template which will be new current template + LDEBUG << "setTypeInCurrentTemplate" << m_eventType<setTypeInCurrentTemplate(m_eventType); 
+ eventData->addTemplate(); return true; } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateFillingActions.h b/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateFillingActions.h index c83e55bac..11f269418 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateFillingActions.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateFillingActions.h @@ -72,8 +72,8 @@ class LIMA_EVENTANALISYS_EXPORT CreateEventTemplate : public Automaton::Constrai ~CreateEventTemplate() {} bool operator()(AnalysisContent& analysis) const; - //bool actionNeedsRecognizedExpression() { return true; } private: + std::string m_eventType; }; class LIMA_EVENTANALISYS_EXPORT ClearEventTemplate : public Automaton::ConstraintFunction diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateStructure.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateStructure.cpp index 6d773c9bc..0407a6f3b 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateStructure.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateStructure.cpp @@ -1,21 +1,3 @@ -/* - Copyright 2002-2013 CEA LIST - - This file is part of LIMA. - - LIMA is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - LIMA is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with LIMA. 
If not, see -*/ /************************************************************************ * * @file EventTemplateStructure.cpp @@ -47,7 +29,7 @@ void EventTemplateStructure::addTemplateElement(const std::string& role, { if (m_structure.find(role)!=m_structure.end()) { LOGINIT("LP::EventAnalysis"); - LERROR << "In event " << m_name << ", element '"<< role <<"' is defined twice" ; + LERROR << "In event " << m_name << ", element '"<< role <<"' is defined twice" << LENDL; } else { Common::MediaticData::EntityType type= diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateStructure.h b/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateStructure.h index 4fe352bf0..8e5d09aff 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateStructure.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateStructure.h @@ -1,21 +1,3 @@ -/* - Copyright 2002-2013 CEA LIST - - This file is part of LIMA. - - LIMA is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - LIMA is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with LIMA. 
If not, see -*/ /************************************************************************ * * @file EventTemplateStructure.h @@ -49,15 +31,18 @@ class LIMA_EVENTANALISYS_EXPORT EventTemplateStructure ~EventTemplateStructure(); void setName(const std::string& name) { m_name=name; } + void setMention(const std::string& name) { m_mention=name; } void addTemplateElement(const std::string& role, const std::string entityType); const std::string& getName(void) const { return m_name; } + const std::string& getMention(void) const { return m_mention; } const std::map& getStructure(void) const { return m_structure; } private: std::string m_name; std::map m_structure; + std::string m_mention; }; } // end namespace diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/FlatTokenizer/Automaton.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/FlatTokenizer/Automaton.cpp index ea6caa1f0..1cb913865 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/FlatTokenizer/Automaton.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/FlatTokenizer/Automaton.cpp @@ -34,6 +34,7 @@ #include "State.h" #include "common/misc/Exceptions.h" +#include "common/tools/FileUtils.h" #include "common/Data/strwstrtools.h" #include "common/AbstractFactoryPattern/SimpleFactory.h" @@ -77,10 +78,8 @@ void Automaton::init( MediaId language=manager->getInitializationParameters().language; try { - std::string resourcePath=Common::MediaticData::MediaticData::single().getResourcesPath(); - std::string charChartFileName=resourcePath + "/" + unitConfiguration.getParamsValueAtKey("automatonFile"); - loadFromFile(charChartFileName); - + QString charChartFileName=Common::Misc::findFileInPaths(Common::MediaticData::MediaticData::single().getResourcesPath().c_str(),unitConfiguration.getParamsValueAtKey("automatonFile").c_str()); + loadFromFile(charChartFileName.toUtf8().constData()); } catch (Common::XMLConfigurationFiles::NoSuchParam& ) { LERROR << "no parameter 
'automatonFile' in tokenizer group for language " << (int) language << " !"; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/FlatTokenizer/CharChart.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/FlatTokenizer/CharChart.cpp index 3daeddb77..a33fde0e8 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/FlatTokenizer/CharChart.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/FlatTokenizer/CharChart.cpp @@ -33,6 +33,7 @@ #include "common/AbstractFactoryPattern/SimpleFactory.h" #include "common/MediaticData/mediaticData.h" +#include "common/tools/FileUtils.h" #include "common/Data/strwstrtools.h" #include @@ -150,9 +151,8 @@ void CharChart::init( MediaId language=manager->getInitializationParameters().language; try { - std::string resourcePath=Common::MediaticData::MediaticData::single().getResourcesPath(); - std::string charChartFileName=resourcePath + "/" + unitConfiguration.getParamsValueAtKey("charFile"); - loadFromFile(charChartFileName); + QString charChartFileName=Common::Misc::findFileInPaths(Common::MediaticData::MediaticData::single().getResourcesPath().c_str(),unitConfiguration.getParamsValueAtKey("charFile").c_str()); + loadFromFile(charChartFileName.toUtf8().constData()); } catch (Common::XMLConfigurationFiles::NoSuchParam& ) { diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/FlatTokenizer/Tokenizer.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/FlatTokenizer/Tokenizer.cpp index 8ce59fa70..062d34b4a 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/FlatTokenizer/Tokenizer.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/FlatTokenizer/Tokenizer.cpp @@ -35,6 +35,7 @@ #include "linguisticProcessing/core/LinguisticProcessors/LimaStringText.h" #include "linguisticProcessing/core/LinguisticAnalysisStructure/AnalysisGraph.h" #include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" +#include 
"common/tools/FileUtils.h" #include "common/MediaticData/mediaticData.h" #include "common/time/timeUtilsController.h" #include @@ -111,10 +112,9 @@ void Tokenizer::init( try { - string resourcesPath=Common::MediaticData::MediaticData::single().getResourcesPath(); - string fileName=resourcesPath +"/"+unitConfiguration.getParamsValueAtKey("automatonFile"); + QString fileName=Common::Misc::findFileInPaths(Common::MediaticData::MediaticData::single().getResourcesPath().c_str(),unitConfiguration.getParamsValueAtKey("automatonFile").c_str()); m_d->_automaton.setCharChart(m_d->_charChart); - m_d->_automaton.loadFromFile(fileName); + m_d->_automaton.loadFromFile(fileName.toUtf8().constData()); } catch (NoSuchParam& ) { diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/FlatTokenizer/TokenizerAutomaton.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/FlatTokenizer/TokenizerAutomaton.cpp index 57fed9350..e7138b78b 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/FlatTokenizer/TokenizerAutomaton.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/FlatTokenizer/TokenizerAutomaton.cpp @@ -84,50 +84,6 @@ void TokenizerAutomaton::init( } m_text=new Text(_language,_charChart); - - try - { - string resourcesPath=Common::MediaticData::MediaticData::single().getResourcesPath(); - string fileName=resourcesPath +"/"+unitConfiguration.getParamsValueAtKey("automatonFile"); - - } - catch (NoSuchParam& ) - { - LERROR << "no param 'automatonFile' in TokenizerAutomaton group configuration (language=" - << (int) _language << ")"; - throw InvalidConfiguration(); - } - // when input XML file is syntactically wrong - catch (XmlSyntaxException exc) - { - std::ostringstream mess; - mess << "XmlSyntaxException at line "< #include "common/misc/depth_first_searchnowarn.hpp" #include -#include +#include //========== defines diff --git 
a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticAnalysisStructure/MorphoSyntacticData.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticAnalysisStructure/MorphoSyntacticData.cpp index b438d2bde..8d4243a71 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticAnalysisStructure/MorphoSyntacticData.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticAnalysisStructure/MorphoSyntacticData.cpp @@ -40,33 +40,6 @@ namespace LinguisticProcessing namespace LinguisticAnalysisStructure { -LinguisticElement::LinguisticElement() : - inflectedForm(0), - lemma(0), - normalizedForm(0), - properties(0), - type(NO_MORPHOSYNTACTICTYPE) - -{ -} -LinguisticElement::LinguisticElement(const LinguisticElement& le) : - inflectedForm(le.inflectedForm), - lemma(le.lemma), - normalizedForm(le.normalizedForm), - properties(le.properties), - type(le.type) -{ -} -LinguisticElement& LinguisticElement::operator=(const LinguisticElement& le) -{ - inflectedForm = le.inflectedForm; - lemma = le.lemma; - normalizedForm = le.normalizedForm; - properties = le.properties; - type = le.type; - return *this; -} - bool LinguisticElement::operator==(const LinguisticElement& le) const { return ((properties==le.properties) && diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticAnalysisStructure/MorphoSyntacticData.h b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticAnalysisStructure/MorphoSyntacticData.h index 2c9aba2d0..745b97737 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticAnalysisStructure/MorphoSyntacticData.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticAnalysisStructure/MorphoSyntacticData.h @@ -57,10 +57,6 @@ enum MorphoSyntacticType { }; struct LIMA_LINGUISTICANALYSISSTRUCTURE_EXPORT LinguisticElement { - LinguisticElement(); - LinguisticElement(const LinguisticElement& le); - LinguisticElement& operator=(const 
LinguisticElement& le); - StringsPoolIndex inflectedForm; StringsPoolIndex lemma; StringsPoolIndex normalizedForm; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/AbstractTextualAnalysisDumper.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/AbstractTextualAnalysisDumper.cpp index 9613f92d2..f49a267c9 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/AbstractTextualAnalysisDumper.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/AbstractTextualAnalysisDumper.cpp @@ -128,18 +128,21 @@ initialize(AnalysisContent& analysis) const if (! m_temporaryFileMetadata.isEmpty()) { #ifdef DEBUG_LP - LDEBUG << "AbstractTextualAnalysisDumper: initialize DumperStream with output file "<< m_outputFile; + LDEBUG << "AbstractTextualAnalysisDumper: initialize DumperStream with temporary file metadata"; #endif LinguisticMetaData* metadata=static_cast(analysis.getData("LinguisticMetaData")); if (metadata == 0) { LERROR << "no LinguisticMetaData ! abort"; } +#ifdef DEBUG_LP + LDEBUG << "AbstractTextualAnalysisDumper: initialize DumperStream with metadata value"<< metadata->getMetaData(m_temporaryFileMetadata.toUtf8().constData()); +#endif return new DumperStream(metadata->getMetaData(m_temporaryFileMetadata.toUtf8().constData()),m_append); } if (! 
m_outputFile.empty()) { #ifdef DEBUG_LP - LDEBUG << "AbstractTextualAnalysisDumper: initialize DumperStream with output file "<< m_outputFile; + LDEBUG << "AbstractTextualAnalysisDumper: initialize DumperStream with output file"<< m_outputFile << m_append; #endif return new DumperStream(m_outputFile,m_append); } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/AnalysisLoader.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/AnalysisLoader.cpp index a2b1e11cb..52cb5be63 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/AnalysisLoader.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/AnalysisLoader.cpp @@ -1,3 +1,153 @@ +#ifdef ANTINNO_SPECIFIC + + + + +// antinno travaille avec la version 2.1-patches tant que la version master n'est pas synchronise + + + + +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. 
If not, see +*/ +/************************************************************************ + * + * @file AnalysisLoader.cpp + * @author Romaric Besancon (romaric.besancon@cea.fr) + * @date Tue Jan 18 2011 + * copyright Copyright (C) 2011 by CEA LIST + * + ***********************************************************************/ + +#include "AnalysisLoader.h" + +#include "linguisticProcessing/core/LinguisticProcessors/LinguisticMetaData.h" +#include "common/AbstractFactoryPattern/SimpleFactory.h" + +namespace Lima { +namespace LinguisticProcessing { + +SimpleFactory AnalysisLoaderFactory(ANALYSISLOADER_CLASSID); + +//*********************************************************************** +// constructors and destructors +AnalysisLoader::AnalysisLoader(): +MediaProcessUnit(), +m_inputFileName(), +m_inputFileExtension(), +m_temporaryFileMetadata() +{ +} + +AnalysisLoader::~AnalysisLoader() { +} + +//*********************************************************************** +void AnalysisLoader::init(Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, + Manager* /*manager*/) + +{ + LOGINIT("LP::AnalysisLoader"); + LDEBUG << "Initialization"; + + bool parameterFound(false); + try + { + m_temporaryFileMetadata = QString::fromUtf8(unitConfiguration.getParamsValueAtKey("temporaryFileMetadata").c_str()); + parameterFound=true; + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) {} // keep default value (empty) + + try { + m_inputFileName=unitConfiguration.getParamsValueAtKey("inputFile"); + parameterFound=true; + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + } + + try { + m_inputFileExtension=unitConfiguration.getParamsValueAtKey("inputSuffix"); + parameterFound=true; + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + } + + if (! 
parameterFound) { + LERROR << "No 'inputFile' or 'inputSuffix' or 'temporaryFileMetadata' parameter in AnalysisLoader"; + throw InvalidConfiguration(); + } + +} + +const std::string& AnalysisLoader::getInputFile(AnalysisContent& analysis) const +{ + static std::string inputFile(""); + if (! m_temporaryFileMetadata.isEmpty()) { + // get temporary filename from metadata + LinguisticMetaData* metadata=static_cast(analysis.getData("LinguisticMetaData")); + if (metadata == 0) + { + LOGINIT("LP::AnalysisLoader"); + LERROR << "no LinguisticMetaData : cannot use 'temporaryFileMetadata' parameter for AnalysisLoader"; + return inputFile; + } + + inputFile = metadata->getMetaData(m_temporaryFileMetadata.toUtf8().constData()); + return inputFile; + } + else if (! m_inputFileName.empty()) { + return m_inputFileName; + } + else if (! m_inputFileExtension.empty()) { + // get filename from metadata + LinguisticMetaData* metadata=static_cast(analysis.getData("LinguisticMetaData")); + if (metadata == 0) + { + LOGINIT("LP::AnalysisLoader"); + LERROR << "no LinguisticMetaData : cannot use 'inputSuffix' parameter for AnalysisLoader"; + return inputFile; + } + + std::string textFileName = metadata->getMetaData("FileName"); + inputFile = textFileName + m_inputFileExtension; + return inputFile; + } + LOGINIT("LP::AnalysisLoader"); + LERROR << "No 'inputFile' found in AnalysisLoader"; + return inputFile; +} + + +} // end namespace +} // end namespace + + + + +#else + + + +// version master + + + /* Copyright 2002-2013 CEA LIST @@ -126,3 +276,7 @@ const std::string& AnalysisLoader::getInputFile(AnalysisContent& analysis) const } // end namespace } // end namespace + + + +#endif \ No newline at end of file diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/AnalysisLoader.h b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/AnalysisLoader.h index f1cef4219..2cf11d21b 100644 --- 
a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/AnalysisLoader.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/AnalysisLoader.h @@ -1,3 +1,98 @@ +#ifdef ANTINNO_SPECIFIC + + + + +// antinno travaille avec la version 2.1-patches tant que la version master n'est pas synchronise + + + + +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. 
If not, see +*/ +/************************************************************************ + * + * @file AnalysisLoader.h + * @author besancon (besanconr@zoe.cea.fr) + * @date Mon Jan 17 2011 + * copyright Copyright (C) 2011 by CEA LIST (LVIC) + * Project MM + * + * @brief abstract class for analysis loaders + * + * + ***********************************************************************/ +#ifndef LIMA_LINGUISTICPROCESSING_ANALYSISLOADER_H +#define LIMA_LINGUISTICPROCESSING_ANALYSISLOADER_H + +#include "LinguisticProcessorsExport.h" +#include "common/MediaProcessors/MediaProcessUnit.h" +#include +#include + +namespace Lima { +namespace LinguisticProcessing { + +#define ANALYSISLOADER_CLASSID "AnalysisLoader" + +/* + * @brief this is the abstract class for analysis loaders, that read + * informations from external files to insert them in the analysis + * data + */ +class LIMA_LINGUISTICPROCESSORS_EXPORT AnalysisLoader : public MediaProcessUnit +{ +public: + AnalysisLoader(); + + virtual ~AnalysisLoader(); + + void init(Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, + Manager* manager) + ; + + LimaStatusCode process(AnalysisContent& /*analysis*/) const { return SUCCESS_ID; } + + const std::string& getInputFile(AnalysisContent& analysis) const; + +protected: + std::string m_inputFileName; + std::string m_inputFileExtension; + QString m_temporaryFileMetadata; +}; + +} // end namespace +} // end namespace + +#endif + + + + +#else + + +// version master + + + + /* Copyright 2002-2013 CEA LIST @@ -71,3 +166,7 @@ class LIMA_LINGUISTICPROCESSORS_EXPORT AnalysisLoader : public MediaProcessUnit } // end namespace #endif + + + +#endif \ No newline at end of file diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/ExternalProcessUnit.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/ExternalProcessUnit.cpp index baf70183c..c64202cfc 100644 --- 
a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/ExternalProcessUnit.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/ExternalProcessUnit.cpp @@ -1,3 +1,11 @@ +#ifdef ANTINNO_SPECIFIC + + + + +// antinno travaille avec la version 2.1-patches tant que la version master n'est pas synchronise + + /* Copyright 2002-2013 CEA LIST @@ -238,3 +246,215 @@ LimaStatusCode ExternalProcessUnit::process(AnalysisContent& analysis) const } // end namespace } // end namespace + + + +#else + + +// version master + +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. 
If not, see +*/ +/************************************************************************ + * + * @file ExternalProcessUnit.cpp + * @author besancon (besanconr@zoe.cea.fr) + * @date Mon Jan 17 2011 + * copyright Copyright (C) 2011 by CEA LIST (LVIC) + * + ***********************************************************************/ + +#include "ExternalProcessUnit.h" + +#include "common/AbstractFactoryPattern/SimpleFactory.h" +#include "linguisticProcessing/common/annotationGraph/AnnotationData.h" +#include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" +#include "common/time/traceUtils.h" +#include "linguisticProcessing/core/LinguisticProcessors/LinguisticMetaData.h" +#include "linguisticProcessing/core/LinguisticResources/AbstractResource.h" +#include "linguisticProcessing/core/LinguisticResources/LinguisticResources.h" +#include "linguisticProcessing/core/LinguisticAnalysisStructure/AnalysisGraph.h" +#include "linguisticProcessing/core/TextSegmentation/SegmentationData.h" +#include "linguisticProcessing/client/AnalysisHandlers/SimpleStreamHandler.h" + +//#include "boost/process.hpp" +#include + +#include + +//namespace bp = ::boost::process; + +using namespace std; + +namespace Lima { +namespace LinguisticProcessing { + +SimpleFactory ExternalProcessUnitFactory(EXTERNALPROCESSUNIT_CLASSID); + +ExternalProcessUnit::ExternalProcessUnit(): +MediaProcessUnit(), +m_dumper(), +m_loader(), +m_commandLine(), +m_inputSuffix(), +m_outputSuffix() +{ +} + +ExternalProcessUnit::~ExternalProcessUnit() +{ +} + + +void ExternalProcessUnit::init( + Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, + Manager* manager) + +{ + LOGINIT("LP::External"); + LDEBUG << "Initialization"; + + MediaId language=manager->getInitializationParameters().media; + try { + string dumperName=unitConfiguration.getParamsValueAtKey("dumper"); + // create the dumper + m_dumper=manager->getObject(dumperName); + } + catch 
(Common::XMLConfigurationFiles::NoSuchParam& ) { + LERROR << "Missing 'dumper' parameter in ExternalProcessUnit group for language " + << (int)language << " !"; + throw InvalidConfiguration(); + } + + try { + string loaderName=unitConfiguration.getParamsValueAtKey("loader"); + // create the loader + m_loader=manager->getObject(loaderName); + } + catch (InvalidConfiguration& ) { + m_loader = 0; + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + LERROR << "Missing 'loader' parameter in ExternalProcessUnit group for language " + << (int)language << " !"; + throw InvalidConfiguration(); + } + + try { + m_inputSuffix=QString::fromUtf8(unitConfiguration.getParamsValueAtKey("inputSuffix").c_str()); + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + // optional parameter: keep default value + } + + try { + m_outputSuffix=QString::fromUtf8(unitConfiguration.getParamsValueAtKey("outputSuffix").c_str()); + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + // optional parameter: keep default value + } + + try { + m_commandLine=QString::fromUtf8(unitConfiguration.getParamsValueAtKey("command").c_str()); + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + LERROR << "Missing 'command' parameter in ExternalProcessUnit group for language " + << (int)language << " !"; + throw InvalidConfiguration(); + } +} + +LimaStatusCode ExternalProcessUnit::process(AnalysisContent& analysis) const +{ + TimeUtils::updateCurrentTime(); + LOGINIT("LP::External"); + LINFO << "ExternalProcessUnit: start"; + + LinguisticMetaData* metadata=static_cast(analysis.getData("LinguisticMetaData")); + if (metadata == 0) { + LERROR << "no LinguisticMetaData ! 
abort"; + return MISSING_DATA; + } + + LimaStatusCode returnCode(SUCCESS_ID); + + // produce temporary file with the given dumper + LDEBUG << "ExternalProcessUnit: write tmp file"; + returnCode=m_dumper->process(analysis); + if (returnCode!=SUCCESS_ID) { + LERROR << "ExternalProcessUnit: failed to dump data to temporary file"; + return returnCode; + } + + QString fileName = QString::fromUtf8(metadata->getMetaData("FileName").c_str()); + QString inputFilename, outputFilename; + // apply command line + LDEBUG << "ExternalProcessUnit: apply external program"; + QString commandLine = m_commandLine; + if (!m_inputSuffix.isEmpty()) + { + inputFilename = fileName+ m_inputSuffix; + } + if (!m_outputSuffix.isEmpty()) + { + outputFilename = fileName + m_outputSuffix; + } + commandLine = commandLine.arg(inputFilename).arg(outputFilename); + LDEBUG << "Launching " << commandLine; + int processResult = QProcess::execute(commandLine); + switch (processResult) { + case -2 : + LERROR << "ExternalProcessUnit: Was not able to start '" << commandLine << "'" ; + return returnCode; + case -1 : + LERROR << "ExternalProcessUnit: '" << commandLine << "' crashed!"; + return returnCode; + case 0 : + break; + default: + LERROR << "ExternalProcessUnit: '" << commandLine << "' returned error status:" << processResult; + return returnCode; + } + + if (m_loader != 0) { + // load results from the external program with the given loader + LDEBUG << "ExternalProcessUnit: read results"; + returnCode=m_loader->process(analysis); + if (returnCode!=SUCCESS_ID) { + LERROR << "ExternalProcessUnit: failed to load data from temporary file"; + return returnCode; + } + } + else { + LWARN << "ExternalProcessUnit: no loader defined for the current external process unit"; + } + + TimeUtils::logElapsedTime("ExternalProcessUnit"); + return returnCode; +} + + +} // end namespace +} // end namespace + + + + +#endif \ No newline at end of file diff --git 
a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/ExternalProcessUnit.h b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/ExternalProcessUnit.h index 0c09a7cbc..5ceb62b45 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/ExternalProcessUnit.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/ExternalProcessUnit.h @@ -1,3 +1,12 @@ +#ifdef ANTINNO_SPECIFIC + + + + +// antinno travaille avec la version 2.1-patches tant que la version master n'est pas synchronise + + + /* Copyright 2002-2013 CEA LIST @@ -77,3 +86,94 @@ class LIMA_LINGUISTICPROCESSORS_EXPORT ExternalProcessUnit : public MediaProcess } // end namespace #endif + + + + +#else + + + + +// version master + + +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. If not, see +*/ +/************************************************************************ + * + * @file externalProcessUnit.h + * @author besancon (besanconr@zoe.cea.fr) + * @date Mon Jan 17 2011 + * copyright Copyright (C) 2011 by CEA LIST (LVIC) + * Project MM + * + * @brief this class contains a generic process unit that use a system call + * to let an external process do the job. 
+ * + * + ***********************************************************************/ +#ifndef LIMA_LINGUISTICPROCESSING_EXTERNALPROCESSUNIT_H +#define LIMA_LINGUISTICPROCESSING_EXTERNALPROCESSUNIT_H + +#include "LinguisticProcessorsExport.h" +#include "common/MediaProcessors/MediaProcessUnit.h" +#include "linguisticProcessing/client/AnalysisHandlers/AbstractTextualAnalysisHandler.h" + +namespace Lima { +namespace LinguisticProcessing { + +#define EXTERNALPROCESSUNIT_CLASSID "ExternalProcessUnit" + +/* + * @brief this class contains a generic process unit that use a system + * call to let an external process do the job. The input for this + * external program is produced by a dumper given as a parameter, + * and the output is read by a Loader also given as a parameter + */ +class LIMA_LINGUISTICPROCESSORS_EXPORT ExternalProcessUnit : public MediaProcessUnit +{ +public: + ExternalProcessUnit(); + + virtual ~ExternalProcessUnit(); + + void init(Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, + Manager* manager) + ; + + LimaStatusCode process(AnalysisContent& analysis) const; + +private: + const MediaProcessUnit* m_dumper; + const MediaProcessUnit* m_loader; + QString m_commandLine; + QString m_inputSuffix; + QString m_outputSuffix; +}; + +} // end namespace +} // end namespace + +#endif + + + + +#endif \ No newline at end of file diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/StatusLogger.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/StatusLogger.cpp index 36a99aa9f..8107e9467 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/StatusLogger.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/StatusLogger.cpp @@ -29,6 +29,7 @@ #include "LinguisticMetaData.h" #include "LimaStringText.h" +#include "common/Data/strwstrtools.h" #include 
"common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" #include "common/AbstractFactoryPattern/SimpleFactory.h" #include "common/time/traceUtils.h" @@ -46,12 +47,13 @@ namespace LinguisticProcessing SimpleFactory statusLoggerFactory(STATUSLOGGER_CLASSID); -StatusLogger::StatusLogger() +StatusLogger::StatusLogger() {} StatusLogger::~StatusLogger() -{} +{ +} void StatusLogger::init( @@ -75,7 +77,7 @@ void StatusLogger::init( { outputFile=string("status.log"); } - m_out= new ofstream(outputFile.c_str(), std::ofstream::binary); + m_out = std::unique_ptr< std::ofstream >(new ofstream(outputFile.c_str(), std::ofstream::binary)); try { deque tolog=unitConfiguration.getListsValueAtKey("toLog"); @@ -118,7 +120,7 @@ LimaStatusCode StatusLogger::process( string line; while (!statusIn.eof()) { - getline(statusIn,line); + line = Lima::Common::Misc::readLine(statusIn); size_t index=line.find(":"); string key=line.substr(0,index); if (m_toLog.find(key)!=m_toLog.end()) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/StatusLogger.h b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/StatusLogger.h index bb2b2873b..98d35fb25 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/StatusLogger.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/StatusLogger.h @@ -61,7 +61,7 @@ class LIMA_LINGUISTICPROCESSORS_EXPORT StatusLogger : public MediaProcessUnit private: - std::ostream* m_out; + std::unique_ptr< std::ofstream > m_out; std::set m_toLog; std::string m_statusFile; uint64_t m_predTime; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticResources/LinguisticResources.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticResources/LinguisticResources.cpp index 25ff78cd8..b3da33aea 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticResources/LinguisticResources.cpp +++ 
b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticResources/LinguisticResources.cpp @@ -27,8 +27,11 @@ #include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" #include "common/MediaticData/mediaticData.h" #include "common/AbstractFactoryPattern/Singleton.h" +#include "common/tools/FileUtils.h" #include "linguisticProcessing/core/AnalysisDict/AbstractAccessResource.h" +#include + using namespace std; using namespace Lima::Common::XMLConfigurationFiles; @@ -153,8 +156,21 @@ includeResources(Common::XMLConfigurationFiles::ModuleConfigurationStructure& mo #ifdef DEBUG_LP LDEBUG << "i="<< i; #endif - fileName=Common::MediaticData::MediaticData::single().getConfigPath()+ - "/"+string((*it),0,i); + QStringList configPaths = QString::fromUtf8(Common::MediaticData::MediaticData::single().getConfigPath().c_str()).split(LIMA_PATH_SEPARATOR); + Q_FOREACH(QString confPath, configPaths) + { + if (QFileInfo(confPath + "/" + string((*it),0,i).c_str()).exists()) + { + + fileName = (confPath + "/" + string((*it),0,i).c_str()).toUtf8().constData(); + break; + } + } + if (fileName.empty()) + { + LERROR << "No resources" << *it << "found in" << Common::MediaticData::MediaticData::single().getConfigPath(); + continue; + } moduleName=string((*it),i+1); LINFO << "includeResources filename="<< fileName << "moduleName="<< moduleName; XMLConfigurationFileParser parser(fileName); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/Modex/Modex.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/Modex/Modex.cpp index 1043fc1bc..ec0c5f90a 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/Modex/Modex.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/Modex/Modex.cpp @@ -30,6 +30,7 @@ #include "linguisticProcessing/core/LinguisticResources/LinguisticResources.h" #include "common/MediaProcessors/MediaProcessors.h" // #include "linguisticProcessing/common/linguisticData/linguisticData.h" +#include 
"common/tools/FileUtils.h" #include "common/Data/strwstrtools.h" #include "common/time/traceUtils.h" @@ -64,7 +65,7 @@ void Modex::init(GroupConfigurationStructure& unitConfiguration, try { // try to get a single automaton string filename=unitConfiguration.getParamsValueAtKey("modexConfig"); - string configFile=LinguisticData::single().getConfigPath()+"/"+filename; + string configFile=Common::Misc::findFileInPaths(LinguisticData::single().getConfigPath().c_str(),filename.c_str()).toUtf8().constData(); initModex(configFile,m_language); } @@ -156,7 +157,7 @@ addConfiguration(ModuleConfigurationStructure& modexConfig, void Modex:: initEntities(const std::string& filename) { - XMLConfigurationFileParser configuration(LinguisticData::single().getConfigPath() + "/" + filename); + XMLConfigurationFileParser configuration(Common::Misc::findFileInPaths(LinguisticData::single().getConfigPath().c_str(),filename.c_str()).toUtf8().constData()); initEntities(configuration); } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/AbbreviationSplitAlternatives.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/AbbreviationSplitAlternatives.cpp index 5fd837217..65702f83c 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/AbbreviationSplitAlternatives.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/AbbreviationSplitAlternatives.cpp @@ -77,8 +77,19 @@ namespace MorphologicAnalysis SimpleFactory abbreviationSplitAlternativesFactory(ABBREVIATIONSPLITALTERNATIVESFACTORY_CLASSID); AbbreviationSplitAlternatives::AbbreviationSplitAlternatives() : - m_reader(0) -{} +m_tokenizer(0), +m_dictionary(0), +m_abbreviations(), +m_language(), +m_confidentMode(true), +m_reader(0), +m_charSplitRegexp() +{ + // default split regexp: split on simple quote or UTF-8 right quotation mark + LimaString quotes=Common::Misc::utf8stdstring2limastring("[']"); + 
m_charSplitRegexp=QRegExp(quotes); + +} AbbreviationSplitAlternatives::~AbbreviationSplitAlternatives() { @@ -156,6 +167,19 @@ void AbbreviationSplitAlternatives::init( LWARN << "use default value : 'true'"; m_confidentMode=true; } + + try + { + string charSplit=unitConfiguration.getParamsValueAtKey("charSplitRegexp"); + m_charSplitRegexp=QRegExp(Common::Misc::utf8stdstring2limastring(charSplit)); + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) + { + LWARN << "no param 'confidentMode' in AbbreviationSplitAlternatives group for language " << (int) m_language; + LWARN << "use default value : 'true'"; + m_confidentMode=true; + } + FsaStringsPool* sp=&Common::MediaticData::MediaticData::changeable().stringsPool(m_language); m_reader=new AlternativesReader(m_confidentMode,true,true,true,charChart,sp); @@ -168,6 +192,9 @@ LimaStatusCode AbbreviationSplitAlternatives::process( MORPHOLOGINIT; LINFO << "MorphologicalAnalysis: starting process AbbreviationSplitAlternatives"; +#ifdef ANTINNO_SPECIFIC + auto const& stopAnalyze = analysis.stopAnalyze(); +#endif AnalysisGraph* tokenList=static_cast(analysis.getData("AnalysisGraph")); LinguisticGraph* graph=tokenList->getGraph(); @@ -193,6 +220,14 @@ LimaStatusCode AbbreviationSplitAlternatives::process( boost::tie(it, it_end) = vertices(*graph); for (; it != it_end; it++) { +#ifdef ANTINNO_SPECIFIC + if (stopAnalyze) + { + MORPHOLOGINIT + LERROR << "Analyze too long. 
Stopped in AbbreviationSplitAlternatives"; + return TIME_OVERFLOW; + } +#endif MorphoSyntacticData* currentData = dataMap[*it]; if (currentData == 0) continue; Token* currentToken= tokenMap[*it]; @@ -268,8 +303,12 @@ bool AbbreviationSplitAlternatives::makeConcatenatedAbbreviationSplitAlternative const LimaString& ft = ftok->stringForm(); LDEBUG << "AbbreviationSplitAlternatives::makeConcatenatedAbbreviationSplitAlternativeFor " << Common::Misc::limastring2utf8stdstring(ft); - int aposPos = ft.indexOf(Common::Misc::utf8stdstring2limastring("'"), 0); - if (aposPos==-1 || aposPos==0) return false; + //int aposPos = ft.indexOf(Common::Misc::utf8stdstring2limastring("'"), 0); + int aposPos = ft.indexOf(m_charSplitRegexp, 0); + //LDEBUG << "AbbreviationSplitAlternatives: split chars found at " << aposPos; + if (aposPos==-1 || aposPos==0) { + return false; + } LimaString beforeAbbrev(ft.left(aposPos-1)); std::vector< LimaString >::const_iterator itAbb = m_abbreviations.begin(); @@ -407,7 +446,8 @@ bool AbbreviationSplitAlternatives::makePossessiveAlternativeFor( const LimaString& ft = ftok->stringForm(); LDEBUG << "AbbreviationSplitAlternatives::makePossessiveAlternativeFor " << Common::Misc::limastring2utf8stdstring(ft); - int aposPos = ft.indexOf(LimaChar('\''), 0); + //int aposPos = ft.indexOf(LimaChar('\''), 0); + int aposPos = ft.indexOf(m_charSplitRegexp, 0); if (aposPos==-1 || aposPos==0) return false; LimaString possessivedWord(ft.left(aposPos)); LDEBUG << "AbbreviationSplitAlternatives::makePossessiveAlternativeFor possesive word: " << Common::Misc::limastring2utf8stdstring(possessivedWord); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/AbbreviationSplitAlternatives.h b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/AbbreviationSplitAlternatives.h index 2426f91f3..1b3a84808 100644 --- 
a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/AbbreviationSplitAlternatives.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/AbbreviationSplitAlternatives.h @@ -86,6 +86,7 @@ class LIMA_MORPHOLOGICANALYSIS_EXPORT AbbreviationSplitAlternatives : public Med MediaId m_language; bool m_confidentMode; AlternativesReader* m_reader; + QRegExp m_charSplitRegexp; bool makeConcatenatedAbbreviationSplitAlternativeFor( LinguisticGraphVertex splitted, diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/AlternativesReader.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/AlternativesReader.cpp index a2c7660da..287fb1a16 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/AlternativesReader.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/AlternativesReader.cpp @@ -159,6 +159,7 @@ void AlternativesReader::readAlternatives( #ifdef DEBUG_LP LDEBUG << "-> StringPool returned index " << idx; #endif + token.addOrthographicAlternatives(idx); DictionaryEntry entry=dico.getEntry(idx,unmarked); #ifdef DEBUG_LP LDEBUG << "entry.isEmpty:" << entry.isEmpty(); @@ -166,7 +167,6 @@ void AlternativesReader::readAlternatives( if (!entry.isEmpty()) { - token.addOrthographicAlternatives(idx); #ifdef DEBUG_LP LDEBUG << "confident mode: " << m_confidentMode; LDEBUG << "lingInfosHandler: " << (void*)accentedHandler << " entry.hasLingInfos:" << entry.hasLingInfos(); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/ConcatenatedDataHandler.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/ConcatenatedDataHandler.cpp index 5465fcceb..7907b1236 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/ConcatenatedDataHandler.cpp +++ 
b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/ConcatenatedDataHandler.cpp @@ -77,11 +77,20 @@ void ConcatenatedDataHandler::foundComponent(uint64_t position, uint64_t length, } concatenated.push_back(componentVertex); +#ifdef ANTINNO_SPECIFIC + // FWI 04/04/2016 + // plantage de l'indexeur sauf si les 4 lignes suivantes sont commentes + // apparement le fait de dtruire *this plante nt.dll sans que la cause soit vidente + // pour test : voir le doc "constitution 2011" en ARA sur la machine "lirac" + // sur ma machine a ne plante pas systmatiquement... + // A noter : dsactiver le paramtre "parseConcatenated" dans SimpleWord permet de courtcircuiter le problme + // -> investiguer +#endif m_currentToken=new Token(form,(*m_stringsPool)[form],m_srcToken->position()+position,length,m_srcToken->status()); put(vertex_token,*m_graph,componentVertex,m_currentToken); m_currentData=new MorphoSyntacticData(); put(vertex_data,*m_graph,componentVertex,m_currentData); - + m_currentElement.inflectedForm=form; } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/DefaultProperties.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/DefaultProperties.cpp index 56aab114a..cdfd130c5 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/DefaultProperties.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/DefaultProperties.cpp @@ -33,6 +33,7 @@ #include "common/MediaticData/mediaticData.h" #include "common/time/timeUtilsController.h" +#include "common/tools/FileUtils.h" #include "common/Data/strwstrtools.h" #include "common/AbstractFactoryPattern/SimpleFactory.h" #include "linguisticProcessing/core/FlatTokenizer/CharChart.h" @@ -73,8 +74,8 @@ void DefaultProperties::init( std::deque skipUnmarkStatus; try { - string file=Common::MediaticData::MediaticData::single().getResourcesPath() + "/" + 
unitConfiguration.getParamsValueAtKey("defaultPropertyFile"); - readDefaultsFromFile(file); + QString file = Common::Misc::findFileInPaths(Common::MediaticData::MediaticData::single().getResourcesPath().c_str(), unitConfiguration.getParamsValueAtKey("defaultPropertyFile").c_str()); + readDefaultsFromFile(file.toUtf8().constData()); } catch (Common::XMLConfigurationFiles::NoSuchParam& ) { @@ -118,6 +119,7 @@ LimaStatusCode DefaultProperties::process( AnalysisContent& analysis) const { Lima::TimeUtilsController timer("DefaultProperties"); + MORPHOLOGINIT; AnalysisGraph* tokenList=static_cast(analysis.getData("AnalysisGraph")); LinguisticGraph* g=tokenList->getGraph(); @@ -142,34 +144,29 @@ LimaStatusCode DefaultProperties::process( // orthographic alternatives, default properties are not applied> if (currentData->empty()) { - auto it = m_defaults.find(currentToken->status().defaultKey()); - if (it!=m_defaults.end()) - { + std::map >::const_iterator it=m_defaults.find(currentToken->status().defaultKey()); + if (it!=m_defaults.end()) { LinguisticElement elem; - elem.inflectedForm = currentToken->form(); - if (!currentToken->orthographicAlternatives().empty()) - { - elem.lemma = *(currentToken->orthographicAlternatives().begin()); - } - else if(m_skipUnmarkStatus.find(currentToken->status().defaultKey())==m_skipUnmarkStatus.end()) - { - elem.lemma= Common::MediaticData::MediaticData::changeable().stringsPool(m_language)[currentToken->stringForm()]; + elem.inflectedForm=currentToken->form(); + LimaString str=currentToken->stringForm(); + if(m_skipUnmarkStatus.find(currentToken->status().defaultKey())==m_skipUnmarkStatus.end()){ + str = m_charChart->unmark(currentToken->stringForm()); } + elem.lemma= Common::MediaticData::MediaticData::changeable().stringsPool(m_language)[str]; elem.normalizedForm=elem.lemma; elem.type=UNKNOWN_WORD; - for (auto codeItr=it->second.begin(); codeItr!=it->second.end();codeItr++) + for (std::vector::const_iterator codeItr=it->second.begin(); + 
codeItr!=it->second.end(); + codeItr++) { elem.properties=*codeItr; currentData->push_back(elem); } - } - else - { - MORPHOLOGINIT; + } else { LWARN << "No default property for " - << currentToken->stringForm() << ". Status : " - << currentToken->status().defaultKey(); + << Common::Misc::limastring2utf8stdstring(currentToken->stringForm()) << ". Status : " + << Common::Misc::limastring2utf8stdstring(currentToken->status().defaultKey()); } } } @@ -193,7 +190,7 @@ void DefaultProperties::readDefaultsFromFile(const std::string& filename) string type; LinguisticCode props; while (fin.good() && !fin.eof()) { - getline(fin,line); + line = Lima::Common::Misc::readLine(fin); if (line.size()>0) { istringstream is(line); is >> type; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/DesagglutinationResources.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/DesagglutinationResources.cpp index 525b2855f..3bd18b633 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/DesagglutinationResources.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/DesagglutinationResources.cpp @@ -28,6 +28,7 @@ #include "DesagglutinationResources.h" #include "common/AbstractFactoryPattern/SimpleFactory.h" +#include "common/tools/FileUtils.h" #include "common/MediaticData/mediaticData.h" #include "linguisticProcessing/common/PropertyCode/PropertyCodeManager.h" #include "linguisticProcessing/common/PropertyCode/PropertyManager.h" @@ -61,8 +62,8 @@ void DesagglutinationResources::init( string resourcesPath=MediaticData::single().getResourcesPath(); try { - string file=resourcesPath + "/" + unitConfiguration.getParamsValueAtKey("categoriesMappingFile"); - loadMicroCategoriesMappingFromFile(file); + QString file = Common::Misc::findFileInPaths(resourcesPath.c_str(), unitConfiguration.getParamsValueAtKey("categoriesMappingFile").c_str()); + 
loadMicroCategoriesMappingFromFile(file.toUtf8().constData()); } catch (Common::XMLConfigurationFiles::NoSuchParam& ) { @@ -71,8 +72,8 @@ void DesagglutinationResources::init( } try { - string file=resourcesPath + "/" + unitConfiguration.getParamsValueAtKey("delimiterFile"); - loadDelimitersFromFile(file); + QString file=Common::Misc::findFileInPaths(resourcesPath.c_str(), unitConfiguration.getParamsValueAtKey("delimiterFile").c_str()); + loadDelimitersFromFile(file.toUtf8().constData()); } catch (Common::XMLConfigurationFiles::NoSuchParam& ) { diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/EnchantSpellingAlternatives.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/EnchantSpellingAlternatives.cpp index 393bb8d1a..44f213bef 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/EnchantSpellingAlternatives.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/EnchantSpellingAlternatives.cpp @@ -82,29 +82,14 @@ void EnchantSpellingAlternatives::init( Manager* manager) { MORPHOLOGINIT; - LDEBUG << "EnchantSpellingAlternatives::init"; m_d->m_language = manager->getInitializationParameters().media; try { - // By default, the spellchecking dictionary is the system one for the current language - std::string spellDico = Common::MediaticData::MediaticData::changeable().getMediaId(m_d->m_language).substr(0,2); - try - { - // try to get a specific spellchecking dictionary name from the config file - spellDico = unitConfiguration.getParamsValueAtKey("spellcheckDictionary"); - } - catch (NoSuchParam& ) - { - } -// LDEBUG << "EnchantSpellingAlternatives::init requesting Enchant spellcheck dictionary" << Common::MediaticData::MediaticData::changeable().getResourcesPath()+"/Spellchecking/" << spellDico; -// 
enchant::Broker::instance()->set_param("enchant.myspell.dictionary.path",Common::MediaticData::MediaticData::changeable().getResourcesPath()+"/Spellchecking/"); - LDEBUG << "EnchantSpellingAlternatives::init requesting Enchant spellcheck dictionary" << spellDico; - m_d->m_enchantDictionary = enchant::Broker::instance()->request_dict(spellDico); + m_d->m_enchantDictionary = enchant::Broker::instance()->request_dict(Common::MediaticData::MediaticData::changeable().getMediaId(m_d->m_language).substr(0,2)); } catch (enchant::Exception& e) { - MORPHOLOGINIT; - LERROR << "Cannot get Enchant dictionary for language" << Common::MediaticData::MediaticData::changeable().getMediaId(m_d->m_language)<< ":" << e.what(); + LERROR << "Cannot get Enchant dictionary for language" << Common::MediaticData::MediaticData::changeable().getMediaId(m_d->m_language); throw LimaException(); } try @@ -180,28 +165,26 @@ void EnchantSpellingAlternativesPrivate::setEnchantSpellingAlternatives( // FIXME Conditions below could be process unit parameters if ( correction.size() > 1 && correction != tokenStr ) { - LDEBUG << "EnchantSpellingAlternativesPrivate::setEnchantSpellingAlternatives trying to correct" << tokenStr << "into" << correction; - DictionaryEntry entry (m_dictionary->getEntry(correction)); + DictionaryEntry* entry = new DictionaryEntry(m_dictionary->getEntry(correction)); MorphoSyntacticDataHandler lingInfosHandler(*tokenData, SPELLING_ALTERNATIVE); -// if (!entry.isEmpty()) + if (!entry->isEmpty()) { LINFO << "EnchantSpellingAlternativesPrivate::setEnchantSpellingAlternatives correcting" << tokenStr << "into" << correction; // add orthographic alternative to Token; StringsPoolIndex idx=sp[correction]; token->addOrthographicAlternatives(idx); - if (entry.hasLingInfos()) + if (entry->hasLingInfos()) { - entry.parseLingInfos(&lingInfosHandler); + entry->parseLingInfos(&lingInfosHandler); } } -// else -// { -// LDEBUG << 
"EnchantSpellingAlternativesPrivate::setEnchantSpellingAlternatives correction" << correction << "not found in the dictionary"; -// delete entry; -// } + else + { + delete entry; + } } } } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/HyphenWordAlternatives.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/HyphenWordAlternatives.cpp index b61398322..0cf475de5 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/HyphenWordAlternatives.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/HyphenWordAlternatives.cpp @@ -1,342 +1,354 @@ -/* - Copyright 2002-2013 CEA LIST - - This file is part of LIMA. - - LIMA is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - LIMA is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with LIMA. If not, see -*/ -/** - * @brief HyphenWordAlternatives is the module which creates split alternatives - * for hyphen word tokens. Each token from the supplied tokens path is processed : - * o FullToken must be "AlphaHyphen" typed by Tokenizer. 
- * o If a token has a single word entry or an orthographic alternative - * it is not decomposed - * o Token is break at hyphen boundaries and a new alternative path is created - * o each FullToken of the new Path is searched into dictionnary as Simple Word - * o If special hyphen entry, no alternatives are searched, - * otherwise Accented alternatives are searched - * o Path is valid even if not all FullToken have entry into dictionary - * @b - * Modified @date Dec, 02 2002 by GC to handle splitting on t_alpha_possessive - * - * @file HyphenWordAlternatives.cpp - * @author NAUTITIA jys - * @author Gael de Chalendar - * @author Copyright (c) 2002-2003 by CEA - * - * @date created on Nov, 30 2002 - * @version $Id$ - * - */ - -#include "HyphenWordAlternatives.h" -#include "MorphoSyntacticDataHandler.h" - -#include "common/Data/LimaString.h" -#include "common/Data/strwstrtools.h" -#include "common/MediaticData/mediaticData.h" -#include "linguisticProcessing/common/annotationGraph/AnnotationData.h" -#include "linguisticProcessing/client/LinguisticProcessingException.h" -#include "common/time/timeUtilsController.h" -#include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" -#include "linguisticProcessing/core/LinguisticAnalysisStructure/MorphoSyntacticData.h" -#include "linguisticProcessing/core/LinguisticAnalysisStructure/Token.h" -#include "common/MediaProcessors/MediaProcessors.h" -#include "linguisticProcessing/core/LinguisticProcessors/LimaStringText.h" -#include "linguisticProcessing/core/LinguisticResources/LinguisticResources.h" -#include "common/AbstractFactoryPattern/SimpleFactory.h" - -using namespace std; -using namespace Lima::Common::MediaticData; -using namespace Lima::Common::AnnotationGraphs; -using namespace Lima::LinguisticProcessing::AnalysisDict; -using namespace Lima::LinguisticProcessing::LinguisticAnalysisStructure; - -namespace Lima -{ -namespace LinguisticProcessing -{ -namespace MorphologicAnalysis -{ - -SimpleFactory 
hyphenwordAlternativesFactory(HYPHENWORDALTERNATIVESFACTORY_CLASSID); - -HyphenWordAlternatives::HyphenWordAlternatives() -{} - -HyphenWordAlternatives::~HyphenWordAlternatives() -{} - -void HyphenWordAlternatives::init( - Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, - Manager* manager) - -{ - MORPHOLOGINIT; - m_language = manager->getInitializationParameters().media; - try - { - string dico=unitConfiguration.getParamsValueAtKey("dictionary"); - AbstractResource* res=LinguisticResources::single().getResource(m_language,dico); - m_dictionary=static_cast(res); - } - catch (Common::XMLConfigurationFiles::NoSuchParam& ) - { - LERROR << "no param 'dictionary' in HyphenWordAlternatives group for language " << (int) m_language; - throw InvalidConfiguration(); - } - try - { - string charchart=unitConfiguration.getParamsValueAtKey("charChart"); - AbstractResource* res=LinguisticResources::single().getResource(m_language,charchart); - m_charChart=static_cast(res); - } - catch (Common::XMLConfigurationFiles::NoSuchParam& ) - { - LERROR << "no param 'charChart' in HyphenWordAlternatives group for language " << (int) m_language; - throw InvalidConfiguration(); - } - try - { - string tok=unitConfiguration.getParamsValueAtKey("tokenizer"); - const MediaProcessUnit* res=manager->getObject(tok); - m_tokenizer=static_cast(res); - } - catch (Common::XMLConfigurationFiles::NoSuchParam& ) - { - LERROR << "no param 'dictionary' in HyphenWordAlternatives group for language " << (int) m_language; - throw InvalidConfiguration(); - } - try - { - m_deleteHyphenWord=( unitConfiguration.getParamsValueAtKey("deleteHyphenWord") == "true"); - } - catch (Common::XMLConfigurationFiles::NoSuchParam& ) - { - LWARN << "no param 'deleteHyphenWord' in HyphenAlternatives group for language " << (int) m_language; - LWARN << "use default value : true"; - m_deleteHyphenWord=true; - } - try - { - string confident=unitConfiguration.getParamsValueAtKey("confidentMode"); - 
m_confidentMode=(confident=="true"); - } - catch (Common::XMLConfigurationFiles::NoSuchParam& ) - { - LWARN << "no param 'confidentMode' in HyphenWordAlternatives group for language " << (int) m_language; - LWARN << "use default value : 'true'"; - m_confidentMode=true; - } - FsaStringsPool* sp=&Common::MediaticData::MediaticData::changeable().stringsPool(m_language); - m_reader=new AlternativesReader(m_confidentMode,true,true,true,m_charChart,sp); -} - -LimaStatusCode HyphenWordAlternatives::process( - AnalysisContent& analysis) const -{ - Lima::TimeUtilsController timer("HyphenWordAlternatives"); - MORPHOLOGINIT; - LINFO << "MorphologicalAnalysis: starting process HyphenWordAlternatives"; - - AnnotationData* annotationData = static_cast< AnnotationData* >(analysis.getData("AnnotationData")); - if (annotationData==0) - { - LDEBUG << "HyphenWordAlternatives::process: Misssing AnnotationData. Create it"; - annotationData = new AnnotationData(); - if (static_cast(analysis.getData("AnalysisGraph")) != 0) - { - static_cast(analysis.getData("AnalysisGraph"))->populateAnnotationGraph(annotationData, "AnalysisGraph"); - } - analysis.setData("AnnotationData",annotationData); - } - - AnalysisGraph* tokenList=static_cast(analysis.getData("AnalysisGraph")); - LinguisticGraph* graph=tokenList->getGraph(); - - VertexDataPropertyMap dataMap = get( vertex_data, *graph ); - VertexTokenPropertyMap tokenMap = get( vertex_token, *graph ); - - try - { - LinguisticGraphVertexIt it, it_end; - boost::tie(it, it_end) = vertices(*graph); - for (; it != it_end; it++) - { - MorphoSyntacticData* currentToken = dataMap[*it]; - Token* tok= tokenMap[*it]; - if (currentToken==0) continue; - // - if (currentToken->size() == 0) - { - if (tok->status().isAlphaHyphen()) - { - makeHyphenSplitAlternativeFor(*it, graph, annotationData); - } - } - } - } - catch (std::exception &exc) - { - MORPHOLOGINIT; - LWARN << "Exception in HyphenWordAlternatives : " << exc.what(); - return UNKNOWN_ERROR; - } - - 
LINFO << "MorphologicalAnalysis: ending process HyphenWordAlternatives"; - return SUCCESS_ID; -} - -void HyphenWordAlternatives::makeHyphenSplitAlternativeFor( - LinguisticGraphVertex splitted, - LinguisticGraph* graph, - AnnotationData* annotationData) const -{ - VertexTokenPropertyMap tokenMap = get( vertex_token, *graph ); - VertexDataPropertyMap dataMap = get( vertex_data, *graph ); - Token* currentToken = tokenMap[splitted]; - - // first, get a copy of token string - LimaString hyphenWord(currentToken->stringForm()); - // first replace hyphens by spaces - int pos = hyphenWord.indexOf(LimaChar(L'-'), 0); - while (pos != -1) - { - hyphenWord[(int)pos] = LimaChar(L' '); - pos = hyphenWord.indexOf(LimaChar(L'-'), pos+1); - } - // then submit string to Tokenizer - AnalysisContent toTokenize; - toTokenize.setData("Text",new LimaStringText(hyphenWord)); - LimaStatusCode status=m_tokenizer->process(toTokenize); - if (status != SUCCESS_ID) return; - AnalysisGraph* agTokenizer=static_cast(toTokenize.getData("AnalysisGraph")); - LinguisticGraph* tokgraph=agTokenizer->getGraph(); - - // setup position field - // insert each new FullToken into alternative path - uint64_t beginPos = currentToken->position()-1; - LinguisticGraphVertex previous = splitted; - LinguisticGraphVertex currentVx=agTokenizer->firstVertex(); - // go one step forward on the new path - { - LinguisticGraphAdjacencyIt adjItr,adjItrEnd; - boost::tie(adjItr,adjItrEnd) = adjacent_vertices(currentVx,*tokgraph); - if (adjItr==adjItrEnd) - { - MORPHOLOGINIT; - LERROR << "HypenWordAlternatives : no token forward !"; - throw LinguisticProcessingException(); - } - currentVx=*adjItr; - } - // LinguisticGraphVertex lastVx=agTokenizer->lastVertex(); - VertexTokenPropertyMap tokTokenMap=get(vertex_token,*tokgraph); - Token* tokenizerToken=tokTokenMap[currentVx]; - - bool isFirst=true; - - while (tokenizerToken) - { - // prepare the new vertex - Token* newFT=new Token(*tokenizerToken); - 
newFT->status().setAlphaHyphen( true ); - MorphoSyntacticData* newData=new MorphoSyntacticData(); - LinguisticGraphVertex newVertex = add_vertex(*graph); - - AnnotationGraphVertex agv = annotationData->createAnnotationVertex(); - annotationData->addMatching("AnalysisGraph", newVertex, "annot", agv); - annotationData->annotate(agv, Common::Misc::utf8stdstring2limastring("AnalysisGraph"), newVertex); - - - tokenMap[newVertex]=newFT; - dataMap[newVertex]=newData; - newFT-> setPosition(newFT->position() + beginPos); - const LimaString& newTokenStr=newFT->stringForm(); - MorphoSyntacticDataHandler handler(*newData,HYPHEN_ALTERNATIVE); - - if (isFirst) - { - LimaString newTokHyphen(newTokenStr); - newTokHyphen.append(LimaChar('-')); - DictionaryEntry dicoEntry(m_dictionary->getEntry(newTokHyphen)); - if (!dicoEntry.isEmpty() && dicoEntry.hasLingInfos()) - { - dicoEntry.parseLingInfos(&handler); - } else { - m_reader->readAlternatives( - *newFT, - *m_dictionary, - &handler, - 0, - &handler); - } - } - else - { - m_reader->readAlternatives( - *newFT, - *m_dictionary, - &handler, - 0, - &handler); - } - - // links the new vertex to its predecessor in the graph - if (previous == splitted) - { - LinguisticGraphInEdgeIt ite, ite_end; - boost::tie(ite, ite_end) = in_edges(splitted, *graph); - for (; ite != ite_end; ite++) - { - add_edge(source(*ite,*graph), newVertex, *graph); - } - } - else - { - add_edge(previous, newVertex, *graph); - } - previous = newVertex; - // go one step forward on the new path - LinguisticGraphAdjacencyIt adjItr,adjItrEnd; - boost::tie(adjItr,adjItrEnd) = adjacent_vertices(currentVx,*tokgraph); - if (adjItr==adjItrEnd) - { - MORPHOLOGINIT; - LERROR << "HypenWordAlternatives : no token forward !"; - throw LinguisticProcessingException(); - } - currentVx=*adjItr; - tokenizerToken=tokTokenMap[currentVx]; - } - - // links the last new vertex created to the successors of the splitted vertex - LinguisticGraphOutEdgeIt ite, ite_end; - boost::tie(ite, 
ite_end) = out_edges(splitted, *graph); - for (; ite != ite_end; ite++) - { - add_edge(previous, target(*ite,*graph), *graph); - } - - // if have to delete hyphen word, then clear it in the graph - if (m_deleteHyphenWord) - { - clear_vertex(splitted,*graph); - } -} - -} // closing namespace MorphologicAnalysis -} // closing namespace LinguisticProcessing -} // closing namespace Lima +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. If not, see +*/ +/** + * @brief HyphenWordAlternatives is the module which creates split alternatives + * for hyphen word tokens. Each token from the supplied tokens path is processed : + * o FullToken must be "AlphaHyphen" typed by Tokenizer. 
+ * o If a token has a single word entry or an orthographic alternative + * it is not decomposed + * o Token is break at hyphen boundaries and a new alternative path is created + * o each FullToken of the new Path is searched into dictionnary as Simple Word + * o If special hyphen entry, no alternatives are searched, + * otherwise Accented alternatives are searched + * o Path is valid even if not all FullToken have entry into dictionary + * @b + * Modified @date Dec, 02 2002 by GC to handle splitting on t_alpha_possessive + * + * @file HyphenWordAlternatives.cpp + * @author NAUTITIA jys + * @author Gael de Chalendar + * @author Copyright (c) 2002-2003 by CEA + * + * @date created on Nov, 30 2002 + * @version $Id$ + * + */ + +#include "HyphenWordAlternatives.h" +#include "MorphoSyntacticDataHandler.h" + +#include "common/Data/LimaString.h" +#include "common/Data/strwstrtools.h" +#include "common/MediaticData/mediaticData.h" +#include "linguisticProcessing/common/annotationGraph/AnnotationData.h" +#include "linguisticProcessing/client/LinguisticProcessingException.h" +#include "common/time/timeUtilsController.h" +#include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" +#include "linguisticProcessing/core/LinguisticAnalysisStructure/MorphoSyntacticData.h" +#include "linguisticProcessing/core/LinguisticAnalysisStructure/Token.h" +#include "common/MediaProcessors/MediaProcessors.h" +#include "linguisticProcessing/core/LinguisticProcessors/LimaStringText.h" +#include "linguisticProcessing/core/LinguisticResources/LinguisticResources.h" +#include "common/AbstractFactoryPattern/SimpleFactory.h" + +using namespace std; +using namespace Lima::Common::MediaticData; +using namespace Lima::Common::AnnotationGraphs; +using namespace Lima::LinguisticProcessing::AnalysisDict; +using namespace Lima::LinguisticProcessing::LinguisticAnalysisStructure; + +namespace Lima +{ +namespace LinguisticProcessing +{ +namespace MorphologicAnalysis +{ + +SimpleFactory 
hyphenwordAlternativesFactory(HYPHENWORDALTERNATIVESFACTORY_CLASSID); + +HyphenWordAlternatives::HyphenWordAlternatives() : m_reader(0) /* stays null until init() succeeds, so the destructor's delete is always well-defined */ +{} + +HyphenWordAlternatives::~HyphenWordAlternatives() +{ + delete m_reader; +} + +void HyphenWordAlternatives::init( + Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, + Manager* manager) + +{ + MORPHOLOGINIT; + m_language = manager->getInitializationParameters().media; + try + { + string dico=unitConfiguration.getParamsValueAtKey("dictionary"); + AbstractResource* res=LinguisticResources::single().getResource(m_language,dico); + m_dictionary=static_cast(res); + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) + { + LERROR << "no param 'dictionary' in HyphenWordAlternatives group for language " << (int) m_language; + throw InvalidConfiguration(); + } + try + { + string charchart=unitConfiguration.getParamsValueAtKey("charChart"); + AbstractResource* res=LinguisticResources::single().getResource(m_language,charchart); + m_charChart=static_cast(res); + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) + { + LERROR << "no param 'charChart' in HyphenWordAlternatives group for language " << (int) m_language; + throw InvalidConfiguration(); + } + try + { + string tok=unitConfiguration.getParamsValueAtKey("tokenizer"); + const MediaProcessUnit* res=manager->getObject(tok); + m_tokenizer=static_cast(res); + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) + { + LERROR << "no param 'tokenizer' in HyphenWordAlternatives group for language " << (int) m_language; /* message now names the key that was actually looked up */ + throw InvalidConfiguration(); + } + try + { + m_deleteHyphenWord=( unitConfiguration.getParamsValueAtKey("deleteHyphenWord") == "true"); + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) + { + LWARN << "no param 'deleteHyphenWord' in HyphenAlternatives group for language " << (int) m_language; + LWARN << "use default value : true"; + m_deleteHyphenWord=true; + } + try + { + string
confident=unitConfiguration.getParamsValueAtKey("confidentMode"); + m_confidentMode=(confident=="true"); + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) + { + LWARN << "no param 'confidentMode' in HyphenWordAlternatives group for language " << (int) m_language; + LWARN << "use default value : 'true'"; + m_confidentMode=true; + } + FsaStringsPool* sp=&Common::MediaticData::MediaticData::changeable().stringsPool(m_language); + m_reader=new AlternativesReader(m_confidentMode,true,true,true,m_charChart,sp); +} + +LimaStatusCode HyphenWordAlternatives::process( + AnalysisContent& analysis) const +{ + Lima::TimeUtilsController timer("HyphenWordAlternatives"); + MORPHOLOGINIT; + LINFO << "MorphologicalAnalysis: starting process HyphenWordAlternatives"; + +#ifdef ANTINNO_SPECIFIC + auto const& stopAnalyze = analysis.stopAnalyze(); +#endif + AnnotationData* annotationData = static_cast< AnnotationData* >(analysis.getData("AnnotationData")); + if (annotationData==0) + { + LDEBUG << "HyphenWordAlternatives::process: Misssing AnnotationData. Create it"; + annotationData = new AnnotationData(); + if (static_cast(analysis.getData("AnalysisGraph")) != 0) + { + static_cast(analysis.getData("AnalysisGraph"))->populateAnnotationGraph(annotationData, "AnalysisGraph"); + } + analysis.setData("AnnotationData",annotationData); + } + + AnalysisGraph* tokenList=static_cast(analysis.getData("AnalysisGraph")); + LinguisticGraph* graph=tokenList->getGraph(); + + VertexDataPropertyMap dataMap = get( vertex_data, *graph ); + VertexTokenPropertyMap tokenMap = get( vertex_token, *graph ); + + try + { + LinguisticGraphVertexIt it, it_end; + boost::tie(it, it_end) = vertices(*graph); + for (; it != it_end; it++) + { +#ifdef ANTINNO_SPECIFIC + if (stopAnalyze) + { + LERROR << "Analyze too long. 
Stopped in HyphenWordAlternatives"; + return TIME_OVERFLOW; + } +#endif + MorphoSyntacticData* currentToken = dataMap[*it]; + Token* tok= tokenMap[*it]; + if (currentToken==0) continue; + // + if (currentToken->size() == 0) + { + if (tok->status().isAlphaHyphen()) + { + makeHyphenSplitAlternativeFor(*it, graph, annotationData); + } + } + } + } + catch (std::exception &exc) + { + MORPHOLOGINIT; + LWARN << "Exception in HyphenWordAlternatives : " << exc.what(); + return UNKNOWN_ERROR; + } + + LINFO << "MorphologicalAnalysis: ending process HyphenWordAlternatives"; + return SUCCESS_ID; +} + +void HyphenWordAlternatives::makeHyphenSplitAlternativeFor( + LinguisticGraphVertex splitted, + LinguisticGraph* graph, + AnnotationData* annotationData) const +{ + VertexTokenPropertyMap tokenMap = get( vertex_token, *graph ); + VertexDataPropertyMap dataMap = get( vertex_data, *graph ); + Token* currentToken = tokenMap[splitted]; + + // first, get a copy of token string + LimaString hyphenWord(currentToken->stringForm()); + // first replace hyphens by spaces + int pos = hyphenWord.indexOf(LimaChar(L'-'), 0); + while (pos != -1) + { + hyphenWord[(int)pos] = LimaChar(L' '); + pos = hyphenWord.indexOf(LimaChar(L'-'), pos+1); + } + // then submit string to Tokenizer + AnalysisContent toTokenize; + toTokenize.setData("Text",new LimaStringText(hyphenWord)); + LimaStatusCode status=m_tokenizer->process(toTokenize); + if (status != SUCCESS_ID) return; + AnalysisGraph* agTokenizer=static_cast(toTokenize.getData("AnalysisGraph")); + LinguisticGraph* tokgraph=agTokenizer->getGraph(); + + // setup position field + // insert each new FullToken into alternative path + uint64_t beginPos = currentToken->position()-1; + LinguisticGraphVertex previous = splitted; + LinguisticGraphVertex currentVx=agTokenizer->firstVertex(); + // go one step forward on the new path + { + LinguisticGraphAdjacencyIt adjItr,adjItrEnd; + boost::tie(adjItr,adjItrEnd) = adjacent_vertices(currentVx,*tokgraph); + if 
(adjItr==adjItrEnd) + { + MORPHOLOGINIT; + LERROR << "HypenWordAlternatives : no token forward !"; + throw LinguisticProcessingException(); + } + currentVx=*adjItr; + } + // LinguisticGraphVertex lastVx=agTokenizer->lastVertex(); + VertexTokenPropertyMap tokTokenMap=get(vertex_token,*tokgraph); + Token* tokenizerToken=tokTokenMap[currentVx]; + + bool isFirst=true; + + while (tokenizerToken) + { + // prepare the new vertex + Token* newFT=new Token(*tokenizerToken); + newFT->status().setAlphaHyphen( true ); + MorphoSyntacticData* newData=new MorphoSyntacticData(); + LinguisticGraphVertex newVertex = add_vertex(*graph); + + AnnotationGraphVertex agv = annotationData->createAnnotationVertex(); + annotationData->addMatching("AnalysisGraph", newVertex, "annot", agv); + annotationData->annotate(agv, Common::Misc::utf8stdstring2limastring("AnalysisGraph"), newVertex); + + + tokenMap[newVertex]=newFT; + dataMap[newVertex]=newData; + newFT-> setPosition(newFT->position() + beginPos); + const LimaString& newTokenStr=newFT->stringForm(); + MorphoSyntacticDataHandler handler(*newData,HYPHEN_ALTERNATIVE); + + if (isFirst) + { + LimaString newTokHyphen(newTokenStr); + newTokHyphen.append(LimaChar('-')); + DictionaryEntry dicoEntry(m_dictionary->getEntry(newTokHyphen)); + if (!dicoEntry.isEmpty() && dicoEntry.hasLingInfos()) + { + dicoEntry.parseLingInfos(&handler); + } else { + m_reader->readAlternatives( + *newFT, + *m_dictionary, + &handler, + 0, + &handler); + } + } + else + { + m_reader->readAlternatives( + *newFT, + *m_dictionary, + &handler, + 0, + &handler); + } + + // links the new vertex to its predecessor in the graph + if (previous == splitted) + { + LinguisticGraphInEdgeIt ite, ite_end; + boost::tie(ite, ite_end) = in_edges(splitted, *graph); + for (; ite != ite_end; ite++) + { + add_edge(source(*ite,*graph), newVertex, *graph); + } + } + else + { + add_edge(previous, newVertex, *graph); + } + previous = newVertex; + // go one step forward on the new path + 
LinguisticGraphAdjacencyIt adjItr,adjItrEnd; + boost::tie(adjItr,adjItrEnd) = adjacent_vertices(currentVx,*tokgraph); + if (adjItr==adjItrEnd) + { + MORPHOLOGINIT; + LERROR << "HypenWordAlternatives : no token forward !"; + throw LinguisticProcessingException(); + } + currentVx=*adjItr; + tokenizerToken=tokTokenMap[currentVx]; + } + + // links the last new vertex created to the successors of the splitted vertex + LinguisticGraphOutEdgeIt ite, ite_end; + boost::tie(ite, ite_end) = out_edges(splitted, *graph); + for (; ite != ite_end; ite++) + { + add_edge(previous, target(*ite,*graph), *graph); + } + + // if have to delete hyphen word, then clear it in the graph + if (m_deleteHyphenWord) + { + clear_vertex(splitted,*graph); + } +} + +} // closing namespace MorphologicAnalysis +} // closing namespace LinguisticProcessing +} // closing namespace Lima diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/OrthographicAlternatives.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/OrthographicAlternatives.cpp index fbeeb44c0..d58dbe53b 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/OrthographicAlternatives.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/OrthographicAlternatives.cpp @@ -127,13 +127,13 @@ LimaStatusCode OrthographicAlternatives::process( for (;it!=itEnd;it++) { LDEBUG << "processing vertex " << *it; - MorphoSyntacticData* currentToken=dataMap[*it]; + MorphoSyntacticData* currentTokenData=dataMap[*it]; Token* tok=tokenMap[*it]; - if (currentToken!=0) + if (currentTokenData!=0) { // if in confidentMode and token has already ling infos, skip - if ( m_confidentMode && (currentToken->size()>0) ) continue; + if ( m_confidentMode && (currentTokenData->size()>0) ) continue; // set orthographic alternatives given by dictionary // using the alternatives directly given by the morphosyntactic data @@ -145,20 +145,20 @@ 
LimaStatusCode OrthographicAlternatives::process( LimaString oa = entry->nextAccented(); while ( oa.size() > 0 ) { - createAlternative(tok,currentToken,oa,m_dictionary,sp); + createAlternative(tok,currentTokenData,oa,m_dictionary,sp); oa = entry->nextAccented(); } } } // if in confidentMode and token has already ling infos, skip - if (m_confidentMode && (currentToken->size() > 0) ) continue; + if (m_confidentMode && (currentTokenData->size() > 0) ) continue; // if no ling infos, then lower and unmark string LDEBUG << "set unmark alternatives"; setOrthographicAlternatives( tok, - currentToken, + currentTokenData, m_dictionary, m_charChart, sp); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/SimpleWord.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/SimpleWord.cpp index f428c8dc0..9ab1424e0 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/SimpleWord.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/SimpleWord.cpp @@ -1,271 +1,278 @@ -/* - Copyright 2002-2013 CEA LIST - - This file is part of LIMA. - - LIMA is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - LIMA is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with LIMA. If not, see -*/ - -// NAUTITIA -// -// jys 8-OCT-2002 -// -// SimpleWord is the implementation of the 1st module of -// Morphological Analysis. Each token from the main tokens -// path is searched into the specified dictionary. 
- - -#include "SimpleWord.h" - -#include "linguisticProcessing/LinguisticProcessingCommon.h" -#include "common/MediaticData/mediaticData.h" -#include "common/time/timeUtilsController.h" -#include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" -#include "linguisticProcessing/common/annotationGraph/AnnotationData.h" -#include "linguisticProcessing/core/LinguisticAnalysisStructure/Token.h" -#include "linguisticProcessing/core/LinguisticAnalysisStructure/AnalysisGraph.h" -#include "linguisticProcessing/core/LinguisticResources/LinguisticResources.h" -#include "linguisticProcessing/core/AnalysisDict/AbstractDictionaryEntry.h" -#include "common/AbstractFactoryPattern/SimpleFactory.h" -#include "common/misc/fsaStringsPool.h" -#include "MorphoSyntacticDataHandler.h" -#include "ConcatenatedDataHandler.h" -#include "AccentedConcatenatedDataHandler.h" -#include "SequenceEntryHandler.h" - -#include - -using namespace std; -using namespace Lima::Common::AnnotationGraphs; -using namespace Lima::Common::MediaticData; -using namespace Lima::Common::XMLConfigurationFiles; -using namespace Lima::LinguisticProcessing::LinguisticAnalysisStructure; -using namespace Lima::LinguisticProcessing::AnalysisDict; - -namespace Lima -{ -namespace LinguisticProcessing -{ -namespace MorphologicAnalysis -{ - -SimpleFactory SimpleWordFactory(SIMPLEWORD_CLASSID); - - -SimpleWord::SimpleWord() : - m_reader(0) -{} - -SimpleWord::~SimpleWord() -{ - if (m_reader==0) - { - delete m_reader; - } -} - -void SimpleWord::init( - Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, - Manager* manager) -{ - MORPHOLOGINIT; - MediaId language = manager->getInitializationParameters().media; - m_sp=&Common::MediaticData::MediaticData::changeable().stringsPool(language); - string dico; - try - { - dico=unitConfiguration.getParamsValueAtKey("dictionary"); - } - catch (NoSuchParam& ) - { - LERROR << "no param 'dictionary' in SimpleWord group for language " << (int) language; 
- throw InvalidConfiguration(); - } - - AbstractResource* res=LinguisticResources::single().getResource(language,dico); - m_dictionary=static_cast(res); - - try - { - string confident=unitConfiguration.getParamsValueAtKey("confidentMode"); - LDEBUG << "SimpleWord set confident mode to:" << confident; - m_confidentMode=(confident=="true"); - } - catch (NoSuchParam& ) - { - LWARN << "no param 'confidentMode' in SimpleWord group for language " << (int) language; - LWARN << "use default value : 'true'"; - m_confidentMode=true; - } - - // initialize dictionary reader - - try - { - string chart=unitConfiguration.getParamsValueAtKey("charChart"); - AbstractResource* res= LinguisticResources::single().getResource(language,chart); - m_charChart=static_cast(res); - } - catch (NoSuchParam& ) - { - LERROR << "no param 'charChart' in SimpleWord group for language " << (int) language; - throw InvalidConfiguration(); - } - - m_reader=new AlternativesReader(m_confidentMode,true,true,true,m_charChart,&Common::MediaticData::MediaticData::changeable().stringsPool(language)); - - try - { - string concat=unitConfiguration.getParamsValueAtKey("parseConcatenated"); - m_parseConcatenated=(concat=="true"); - } - catch (NoSuchParam& ) - { - LWARN << "no param 'parseConcatenated' in SimpleWord group for language " << (int) language; - LWARN << "use default value : 'true'"; - m_confidentMode=true; - } - -} - - -LimaStatusCode SimpleWord::process( - AnalysisContent& analysis) const -{ - Lima::TimeUtilsController timer("SimpleWord"); - MORPHOLOGINIT; - LINFO << "starting process SimpleWord"; - - AnalysisGraph* tokenList=static_cast(analysis.getData("AnalysisGraph")); - - - LinguisticGraph* g=tokenList->getGraph(); - LinguisticGraphVertexIt it,itEnd; - VertexTokenPropertyMap tokenMap=get(vertex_token,*g); - VertexDataPropertyMap dataMap=get(vertex_data,*g); - boost::tie(it,itEnd)=vertices(*g); - for (;it!=itEnd;it++) - { - Token* currentToken=tokenMap[*it]; - if (currentToken!=0) - { -#ifdef 
DEBUG_LP - LDEBUG << "SimpleWord for token" << currentToken->stringForm(); -#endif - // Init handlers - MorphoSyntacticData* msd=dataMap[*it]; - AbstractDictionaryEntryHandler* lingInfoHandler=new MorphoSyntacticDataHandler(*msd,SIMPLE_WORD); - ConcatenatedDataHandler* concatHandler=0; - AccentedConcatenatedDataHandler* accentedConcatHandler=0; - AbstractDictionaryEntryHandler* accentedHandler=lingInfoHandler; - - if (m_parseConcatenated) { - concatHandler=new ConcatenatedDataHandler(g,currentToken,SIMPLE_WORD,m_sp); - accentedConcatHandler=new AccentedConcatenatedDataHandler( - g, - currentToken->stringForm(), - currentToken->position(), - currentToken->status(), - SIMPLE_WORD, - m_sp, - m_charChart); - SequenceEntryHandler* seh=new SequenceEntryHandler(); - seh->addHandler(lingInfoHandler); - seh->addHandler(accentedConcatHandler); - accentedHandler=seh; - } - - // parse data - DictionaryEntry entry(m_dictionary->getEntry(currentToken->form(),currentToken->stringForm())); - m_reader->readAlternatives( - *currentToken, - *m_dictionary, - lingInfoHandler, - concatHandler, - accentedHandler); - - // finalize - if (concatHandler && !concatHandler->getConcatVertices().empty()) { - linkConcatVertices(g,*it,concatHandler->getConcatVertices()); - if (msd->empty()) { - clear_vertex(*it,*g); - } - } - if (accentedConcatHandler && !accentedConcatHandler->getConcatVertices().empty()) { - linkConcatVertices(g,*it,accentedConcatHandler->getConcatVertices()); - if (msd->empty()) { - clear_vertex(*it,*g); - } - } - if (m_parseConcatenated) { - delete concatHandler; - delete accentedConcatHandler; - delete accentedHandler; - } - delete lingInfoHandler; - - } - } - - AnnotationData* annotationData = static_cast< AnnotationData* >(analysis.getData("AnnotationData")); - if (annotationData==0) - { - LINFO << "SimpleWord::process no annotation data, creating and populating it"; - annotationData=new AnnotationData(); - analysis.setData("AnnotationData",annotationData); - } - 
tokenList->populateAnnotationGraph(annotationData, "AnalysisGraph"); - if (static_cast(analysis.getData("PosGraph")) != 0) - { - static_cast(analysis.getData("PosGraph"))->populateAnnotationGraph(annotationData, "PosGraph"); - } - -#ifdef DEBUG_LP - LDEBUG << "ending process SimpleWord"; -#endif - return SUCCESS_ID; -} - -void SimpleWord::linkConcatVertices( - LinguisticGraph* graph, - LinguisticGraphVertex srcToken, - const std::vector >& concats) const -{ - LinguisticGraphInEdgeIt ieItr,ieItrEnd; - for (boost::tie(ieItr,ieItrEnd) = in_edges(srcToken,*graph); - ieItr!=ieItrEnd; - ieItr++) - { - LinguisticGraphVertex pred=source(*ieItr,*graph); - LinguisticGraphOutEdgeIt oeItr,oeItrEnd; - for (boost::tie(oeItr,oeItrEnd) = out_edges(srcToken,*graph); - oeItr!=oeItrEnd; - oeItr++) - { - LinguisticGraphVertex next=target(*oeItr,*graph); - for (std::vector >::const_iterator concatItr=concats.begin(); - concatItr!=concats.end(); - concatItr++) - { - add_edge(pred,concatItr->front(),*graph); - add_edge(concatItr->back(),next,*graph); - } - } - } -} - - -} // MorphologicAnalysis -} // LinguisticProcessing -} // Lima +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. If not, see +*/ + +// NAUTITIA +// +// jys 8-OCT-2002 +// +// SimpleWord is the implementation of the 1st module of +// Morphological Analysis. 
Each token from the main tokens +// path is searched into the specified dictionary. + + +#include "SimpleWord.h" + +#include "linguisticProcessing/LinguisticProcessingCommon.h" +#include "common/MediaticData/mediaticData.h" +#include "common/time/timeUtilsController.h" +#include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" +#include "linguisticProcessing/common/annotationGraph/AnnotationData.h" +#include "linguisticProcessing/core/LinguisticAnalysisStructure/Token.h" +#include "linguisticProcessing/core/LinguisticAnalysisStructure/AnalysisGraph.h" +#include "linguisticProcessing/core/LinguisticResources/LinguisticResources.h" +#include "linguisticProcessing/core/AnalysisDict/AbstractDictionaryEntry.h" +#include "common/AbstractFactoryPattern/SimpleFactory.h" +#include "common/misc/fsaStringsPool.h" +#include "MorphoSyntacticDataHandler.h" +#include "ConcatenatedDataHandler.h" +#include "AccentedConcatenatedDataHandler.h" +#include "SequenceEntryHandler.h" + +#include + +using namespace std; +using namespace Lima::Common::AnnotationGraphs; +using namespace Lima::Common::MediaticData; +using namespace Lima::Common::XMLConfigurationFiles; +using namespace Lima::LinguisticProcessing::LinguisticAnalysisStructure; +using namespace Lima::LinguisticProcessing::AnalysisDict; + +namespace Lima +{ +namespace LinguisticProcessing +{ +namespace MorphologicAnalysis +{ + +SimpleFactory SimpleWordFactory(SIMPLEWORD_CLASSID); + + +SimpleWord::SimpleWord() : + m_reader(0) +{} + +SimpleWord::~SimpleWord() +{ + delete m_reader; +} + +void SimpleWord::init( + Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, + Manager* manager) +{ + MORPHOLOGINIT; + MediaId language = manager->getInitializationParameters().media; + m_sp=&Common::MediaticData::MediaticData::changeable().stringsPool(language); + string dico; + try + { + dico=unitConfiguration.getParamsValueAtKey("dictionary"); + } + catch (NoSuchParam& ) + { + LERROR << "no param 
'dictionary' in SimpleWord group for language " << (int) language; + throw InvalidConfiguration(); + } + + AbstractResource* res=LinguisticResources::single().getResource(language,dico); + m_dictionary=static_cast(res); + + try + { + string confident=unitConfiguration.getParamsValueAtKey("confidentMode"); + LDEBUG << "SimpleWord set confident mode to:" << confident; + m_confidentMode=(confident=="true"); + } + catch (NoSuchParam& ) + { + LWARN << "no param 'confidentMode' in SimpleWord group for language " << (int) language; + LWARN << "use default value : 'true'"; + m_confidentMode=true; + } + + // initialize dictionary reader + + try + { + string chart=unitConfiguration.getParamsValueAtKey("charChart"); + AbstractResource* res= LinguisticResources::single().getResource(language,chart); + m_charChart=static_cast(res); + } + catch (NoSuchParam& ) + { + LERROR << "no param 'charChart' in SimpleWord group for language " << (int) language; + throw InvalidConfiguration(); + } + + m_reader=new AlternativesReader(m_confidentMode,true,true,true,m_charChart,&Common::MediaticData::MediaticData::changeable().stringsPool(language)); + + try + { + string concat=unitConfiguration.getParamsValueAtKey("parseConcatenated"); + m_parseConcatenated=(concat=="true"); + } + catch (NoSuchParam& ) + { + LWARN << "no param 'parseConcatenated' in SimpleWord group for language " << (int) language; + LWARN << "use default value : 'true'"; + m_parseConcatenated=true; /* the default belongs to parseConcatenated; the configured confidentMode must not be overwritten here */ + } + +} + + +LimaStatusCode SimpleWord::process( + AnalysisContent& analysis) const +{ + Lima::TimeUtilsController timer("SimpleWord"); + MORPHOLOGINIT; + LINFO << "starting process SimpleWord"; + +#ifdef ANTINNO_SPECIFIC + auto const& stopAnalyze = analysis.stopAnalyze(); +#endif + AnalysisGraph* tokenList=static_cast(analysis.getData("AnalysisGraph")); + + + LinguisticGraph* g=tokenList->getGraph(); + LinguisticGraphVertexIt it,itEnd; + VertexTokenPropertyMap tokenMap=get(vertex_token,*g); + VertexDataPropertyMap
dataMap=get(vertex_data,*g); + boost::tie(it,itEnd)=vertices(*g); + for (;it!=itEnd;it++) + { +#ifdef ANTINNO_SPECIFIC + if (stopAnalyze) + { + LERROR << "Analyze too long. Stopped in SimpleWord"; + return TIME_OVERFLOW; + } +#endif + Token* currentToken=tokenMap[*it]; + if (currentToken!=0) + { +#ifdef DEBUG_LP + LDEBUG << "SimpleWord for token" << currentToken->stringForm(); +#endif + // Init handlers + MorphoSyntacticData* msd=dataMap[*it]; + AbstractDictionaryEntryHandler* lingInfoHandler=new MorphoSyntacticDataHandler(*msd,SIMPLE_WORD); + ConcatenatedDataHandler* concatHandler=0; + AccentedConcatenatedDataHandler* accentedConcatHandler=0; + AbstractDictionaryEntryHandler* accentedHandler=lingInfoHandler; + + if (m_parseConcatenated) { + concatHandler=new ConcatenatedDataHandler(g,currentToken,SIMPLE_WORD,m_sp); + accentedConcatHandler=new AccentedConcatenatedDataHandler( + g, + currentToken->stringForm(), + currentToken->position(), + currentToken->status(), + SIMPLE_WORD, + m_sp, + m_charChart); + SequenceEntryHandler* seh=new SequenceEntryHandler(); + seh->addHandler(lingInfoHandler); + seh->addHandler(accentedConcatHandler); + accentedHandler=seh; + } + + // parse data + DictionaryEntry entry(m_dictionary->getEntry(currentToken->form(),currentToken->stringForm())); + m_reader->readAlternatives( + *currentToken, + *m_dictionary, + lingInfoHandler, + concatHandler, + accentedHandler); + + // finalize + if (concatHandler && !concatHandler->getConcatVertices().empty()) { + linkConcatVertices(g,*it,concatHandler->getConcatVertices()); + if (msd->empty()) { + clear_vertex(*it,*g); + } + } + if (accentedConcatHandler && !accentedConcatHandler->getConcatVertices().empty()) { + linkConcatVertices(g,*it,accentedConcatHandler->getConcatVertices()); + if (msd->empty()) { + clear_vertex(*it,*g); + } + } + if (m_parseConcatenated) { + delete concatHandler; + delete accentedConcatHandler; + delete accentedHandler; + } + delete lingInfoHandler; + + } + } + + 
AnnotationData* annotationData = static_cast< AnnotationData* >(analysis.getData("AnnotationData")); + if (annotationData==0) + { + LINFO << "SimpleWord::process no annotation data, creating and populating it"; + annotationData=new AnnotationData(); + analysis.setData("AnnotationData",annotationData); + } + tokenList->populateAnnotationGraph(annotationData, "AnalysisGraph"); + if (static_cast(analysis.getData("PosGraph")) != 0) + { + static_cast(analysis.getData("PosGraph"))->populateAnnotationGraph(annotationData, "PosGraph"); + } + +#ifdef DEBUG_LP + LDEBUG << "ending process SimpleWord"; +#endif + return SUCCESS_ID; +} + +void SimpleWord::linkConcatVertices( + LinguisticGraph* graph, + LinguisticGraphVertex srcToken, + const std::vector >& concats) const +{ + LinguisticGraphInEdgeIt ieItr,ieItrEnd; + for (boost::tie(ieItr,ieItrEnd) = in_edges(srcToken,*graph); + ieItr!=ieItrEnd; + ieItr++) + { + LinguisticGraphVertex pred=source(*ieItr,*graph); + LinguisticGraphOutEdgeIt oeItr,oeItrEnd; + for (boost::tie(oeItr,oeItrEnd) = out_edges(srcToken,*graph); + oeItr!=oeItrEnd; + oeItr++) + { + LinguisticGraphVertex next=target(*oeItr,*graph); + for (std::vector >::const_iterator concatItr=concats.begin(); + concatItr!=concats.end(); + concatItr++) + { + add_edge(pred,concatItr->front(),*graph); + add_edge(concatItr->back(),next,*graph); + } + } + } +} + + +} // MorphologicAnalysis +} // LinguisticProcessing +} // Lima diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/DynamicSvmToolPosTagger.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/DynamicSvmToolPosTagger.cpp index 820002935..4ed337397 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/DynamicSvmToolPosTagger.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/DynamicSvmToolPosTagger.cpp @@ -25,6 +25,7 @@ #include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" +#include 
"common/tools/FileUtils.h" #include "common/MediaticData/mediaticData.h" #include "common/time/traceUtils.h" #include "linguisticProcessing/common/annotationGraph/AnnotationData.h" @@ -46,7 +47,9 @@ #include #include #include +#ifdef ANTINNO_SPECIFIC #include +#endif #include // LDBL_MIN/MAX #include // log @@ -148,7 +151,7 @@ void DynamicSvmToolPosTagger::init( // Creates the tagger we use erCompRegExp(); - t = new tagger(resourcesPath + "/" + model); + t = new tagger(Common::Misc::findFileInPaths(resourcesPath.c_str(), model.c_str()).toUtf8().constData()); t->taggerLoadModelsForTagging(); t->taggerShowComments(); t->taggerActiveShowScoresFlag(); @@ -195,8 +198,13 @@ LimaStatusCode DynamicSvmToolPosTagger::process(AnalysisContent& analysis) const std::map maxAncestor; /* Push every vertex coming from vertex 0 onto the "tokens to be visited" list */ +#ifdef ANTINNO_SPECIFIC BOOST_FOREACH(LinguisticGraphVertex vertex, - nextTokens(analysisGraph->firstVertex(), srcGraph)) + nextTokens(analysisGraph->firstVertex(), srcGraph)) +#else + for(LinguisticGraphVertex vertex: + nextTokens(analysisGraph->firstVertex(), srcGraph)) +#endif { tokenQueue.push(vertex); } @@ -218,7 +226,11 @@ LimaStatusCode DynamicSvmToolPosTagger::process(AnalysisContent& analysis) const /* For every ancestor of our node */ std::set previousTokens = getPreviousTokens(vertex, srcGraph); if(previousTokens.empty()) previousTokens.insert(posGraph->firstVertex()); +#ifdef ANTINNO_SPECIFIC BOOST_FOREACH(LinguisticGraphVertex prevVertex, previousTokens) { +#else + for(LinguisticGraphVertex prevVertex: previousTokens) { +#endif std::string pos = ""; double logCurWeight = log(1.0), w; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/SvmToolPosTagger.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/SvmToolPosTagger.cpp index b8871d1e4..a518b7804 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/SvmToolPosTagger.cpp +++ 
b/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/SvmToolPosTagger.cpp @@ -27,14 +27,12 @@ #include "linguisticProcessing/common/PropertyCode/PropertyCodeManager.h" #include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" #include "common/MediaticData/mediaticData.h" +#include "common/tools/FileUtils.h" #include "common/Data/strwstrtools.h" #include "common/time/timeUtilsController.h" #include "svmtool/tagger.h" -#include -#include -#include - +#include int verbose = FALSE; @@ -89,7 +87,10 @@ void SvmToolPosTagger::init( string resourcesPath=MediaticData::single().getResourcesPath(); try { - m_model = resourcesPath + "/" + unitConfiguration.getParamsValueAtKey("model"); + string modelName=unitConfiguration.getParamsValueAtKey("model"); + // add .DICT to find the file, remove it to get the generic model name + path + m_model = Common::Misc::findFileInPaths(resourcesPath.c_str(), modelName.append(".DICT").c_str()).toUtf8().constData(); + boost::replace_last(m_model,".DICT",""); } catch (Common::XMLConfigurationFiles::NoSuchParam& ) { diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/ViterbiPosTagger.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/ViterbiPosTagger.cpp index 26e64e006..58e3c856d 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/ViterbiPosTagger.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/ViterbiPosTagger.cpp @@ -24,6 +24,7 @@ #include "ViterbiPosTagger.h" #include "linguisticProcessing/core/LinguisticResources/LinguisticResources.h" +#include "common/tools/FileUtils.h" #include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" #include "common/MediaticData/mediaticData.h" #include "integerCost.h" @@ -38,7 +39,7 @@ namespace LinguisticProcessing namespace PosTagger { -ViterbiPosTaggerFactory* ViterbiPosTaggerFactory::s_instance=new ViterbiPosTaggerFactory(VITERBIPOSTAGGER_CLASSID); 
+std::unique_ptr< ViterbiPosTaggerFactory > ViterbiPosTaggerFactory::s_instance=std::unique_ptr< ViterbiPosTaggerFactory >(new ViterbiPosTaggerFactory(VITERBIPOSTAGGER_CLASSID)); ViterbiPosTaggerFactory::ViterbiPosTaggerFactory(const std::string& id) : @@ -73,8 +74,8 @@ MediaProcessUnit* ViterbiPosTaggerFactory::create( string resourcesPath=MediaticData::single().getResourcesPath(); try { - trigramsFile=resourcesPath + "/" + unitConfiguration.getParamsValueAtKey("trigramFile"); - bigramsFile=resourcesPath + "/" + unitConfiguration.getParamsValueAtKey("bigramFile"); + trigramsFile = Common::Misc::findFileInPaths(resourcesPath.c_str(), unitConfiguration.getParamsValueAtKey("trigramFile").c_str()).toUtf8().constData(); + bigramsFile=Common::Misc::findFileInPaths(resourcesPath.c_str(), unitConfiguration.getParamsValueAtKey("bigramFile").c_str()).toUtf8().constData(); } catch (Common::XMLConfigurationFiles::NoSuchParam& ) { diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/ViterbiPosTagger.h b/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/ViterbiPosTagger.h index 696efca31..68fcf255f 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/ViterbiPosTagger.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/ViterbiPosTagger.h @@ -199,7 +199,7 @@ class LIMA_POSTAGGER_EXPORT ViterbiPosTaggerFactory : public InitializableObject private: ViterbiPosTaggerFactory(const std::string& id); - static ViterbiPosTaggerFactory* s_instance; + static std::unique_ptr< ViterbiPosTaggerFactory > s_instance; }; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/ngramMatrices.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/ngramMatrices.cpp index 6fdc50576..ab97892fe 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/ngramMatrices.cpp +++ 
b/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/ngramMatrices.cpp @@ -26,6 +26,7 @@ */ #include "ngramMatrices.h" +#include "common/tools/FileUtils.h" #include "common/Data/strwstrtools.h" #include "common/MediaticData/mediaticData.h" #include "linguisticProcessing/common/linguisticData/languageData.h" @@ -68,7 +69,7 @@ void TrigramMatrix::init( string resourcesPath=MediaticData::single().getResourcesPath(); try { - string trigramFile=resourcesPath + "/" + unitConfiguration.getParamsValueAtKey("trigramFile"); + string trigramFile = Common::Misc::findFileInPaths(resourcesPath.c_str(), unitConfiguration.getParamsValueAtKey("trigramFile").c_str()).toUtf8().constData(); readTrigramMatrixFile(trigramFile); } catch (Common::XMLConfigurationFiles::NoSuchParam& ) @@ -96,9 +97,8 @@ void TrigramMatrix::readTrigramMatrixFile(const std::string& fileName) boost::regex linere("^(.+)\t(.+)\t(.+)\t(\\d+(\\.\\d+)?)$"); boost::regex numre("^\\d+$"); - std::string lineString; + std::string lineString = Lima::Common::Misc::readLine(ifl); size_t linenum(0); - getline(ifl, lineString); while (ifl.good() && !ifl.eof()) { Common::Misc::chomp(lineString); @@ -140,7 +140,7 @@ void TrigramMatrix::readTrigramMatrixFile(const std::string& fileName) // LDEBUG << "Got trigram: ["<& regexes = unitConfiguration.getMapAtKey("regexes"); for (std::map ::const_iterator it = regexes.begin(); it != regexes.end(); it++) { diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/ConllDumper.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/ConllDumper.cpp index b415504b4..de0b8e766 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/ConllDumper.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/ConllDumper.cpp @@ -1,4 +1,683 @@ +#ifdef ANTINNO_SPECIFIC + + + + +// antinno travaille avec la version 2.1-patches tant que la version master n'est pas synchronise + + + 
+/* + Copyright 2002-2014 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. If not, see +*/ + +#include "ConllDumper.h" +#include "common/MediaProcessors/DumperStream.h" +#include "common/time/traceUtils.h" +#include "common/Data/strwstrtools.h" +#include "common/MediaticData/mediaticData.h" +#include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" +#include "common/AbstractFactoryPattern/SimpleFactory.h" +#include "linguisticProcessing/LinguisticProcessingCommon.h" +#include "linguisticProcessing/common/annotationGraph/AnnotationGraph.h" +#include "linguisticProcessing/common/annotationGraph/AnnotationData.h" +#include "linguisticProcessing/core/LinguisticProcessors/LinguisticMetaData.h" +#include "linguisticProcessing/core/LinguisticResources/LinguisticResources.h" +#include "linguisticProcessing/core/LinguisticAnalysisStructure/LinguisticGraph.h" +#include "linguisticProcessing/core/LinguisticAnalysisStructure/AnalysisGraph.h" +#include "linguisticProcessing/core/TextSegmentation/SegmentationData.h" +#include "linguisticProcessing/core/SyntacticAnalysis/SyntacticData.h" +#include "linguisticProcessing/core/LinguisticAnalysisStructure/MorphoSyntacticData.h" +#include "linguisticProcessing/core/LinguisticAnalysisStructure/MorphoSyntacticDataUtils.h" +#include "linguisticProcessing/core/Automaton/SpecificEntityAnnotation.h" +#include "common/misc/AbstractAccessByString.h" +#include 
"linguisticProcessing/core/AnalysisDumpers/EasyXmlDumper/ConstituantAndRelationExtractor.h" +#include "linguisticProcessing/core/AnalysisDumpers/EasyXmlDumper/relation.h" +#include "linguisticProcessing/core/SemanticAnalysis/LimaConllTokenIdMapping.h" + +#include +#include +#include + +#include + +using namespace Lima::Common; +using namespace Lima::Common::MediaticData; +using namespace Lima::Common::XMLConfigurationFiles; +using namespace Lima::Common::AnnotationGraphs; +using namespace Lima::LinguisticProcessing::SpecificEntities; +using namespace Lima::LinguisticProcessing::SemanticAnalysis; +using namespace Lima::LinguisticProcessing::SyntacticAnalysis; +using namespace Lima::LinguisticProcessing::LinguisticAnalysisStructure; + +namespace Lima +{ + +namespace LinguisticProcessing +{ + +namespace AnalysisDumpers +{ + +SimpleFactory conllDumperFactory(CONLLDUMPER_CLASSID); + +class ConllDumperPrivate +{ + friend class ConllDumper; + ConllDumperPrivate(); + + virtual ~ConllDumperPrivate(); + + /** + * @brief Collect all annotation tokens corresponding to a predicate of the + * sentence starting at @ref sentenceBegin and finishing at @ref sentenceEnd + */ + QMultiMap collectPredicateTokens( + Lima::AnalysisContent& analysis, LinguisticGraphVertex sentenceBegin, LinguisticGraphVertex sentenceEnd); + + MediaId m_language; + std::string m_property; + const Common::PropertyCode::PropertyAccessor* m_propertyAccessor; + const Common::PropertyCode::PropertyManager* m_propertyManager; + const Common::PropertyCode::PropertyManager* m_timeManager; //Ajout + const Common::PropertyCode::PropertyAccessor* m_timeAccessor; //Ajout + + std::string m_graph; + std::string m_sep; + std::string m_sepPOS; + std::string m_verbTenseFlag; //Ajout + QMap m_conllLimaDepMapping; + std::string m_suffix; +}; + + +ConllDumperPrivate::ConllDumperPrivate(): +m_language(0), +m_property("MICRO"), +m_propertyAccessor(0), +m_propertyManager(0), +m_graph("PosGraph"), +m_sep(" "), +m_sepPOS("#"), 
+m_conllLimaDepMapping(), +m_suffix(".conll") +{ +} + +ConllDumperPrivate::~ConllDumperPrivate() +{} + +ConllDumper::ConllDumper(): +AbstractTextualAnalysisDumper(), +m_d(new ConllDumperPrivate()) +{ +} + +ConllDumper::~ConllDumper() +{ + delete m_d; +} + +void ConllDumper::init(Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, + Manager* manager) +{ + DUMPERLOGINIT; + AbstractTextualAnalysisDumper::init(unitConfiguration,manager); + m_d->m_language=manager->getInitializationParameters().media; + try + { + m_d->m_graph=unitConfiguration.getParamsValueAtKey("graph"); + } + catch (NoSuchParam& ) {} // keep default value + const Common::PropertyCode::PropertyCodeManager& codeManager=static_cast(Common::MediaticData::MediaticData::single().mediaData(m_d->m_language)).getPropertyCodeManager(); + m_d->m_propertyAccessor=&codeManager.getPropertyAccessor("MICRO"); + + try + { + m_d->m_verbTenseFlag=unitConfiguration.getParamsValueAtKey("verbTenseFlag"); + } + catch (NoSuchParam& ) { + m_d->m_verbTenseFlag=std::string("False"); + } // keep default value + + try + { + m_d->m_sep=unitConfiguration.getParamsValueAtKey("sep"); + } + catch (NoSuchParam& ) {} // keep default value + + try + { + m_d->m_sepPOS=unitConfiguration.getParamsValueAtKey("sepPOS"); + } + catch (NoSuchParam& ) {} // keep default value + + try + { + m_d->m_property=unitConfiguration.getParamsValueAtKey("property"); + } + catch (NoSuchParam& ) {} // keep default value + try + { + m_d->m_suffix=unitConfiguration.getParamsValueAtKey("outputSuffix"); + } + catch (NoSuchParam& ) {} // keep default value + m_d->m_propertyManager=&codeManager.getPropertyManager(m_d->m_property); + + m_d->m_timeManager=&codeManager.getPropertyManager("TIME"); + m_d->m_timeAccessor=&codeManager.getPropertyAccessor("TIME"); + + try { + std::string resourcePath = Common::MediaticData::MediaticData::single().getResourcesPath(); + std::string mappingFile = resourcePath + "/" + 
unitConfiguration.getParamsValueAtKey("mappingFile"); + std::ifstream ifs(mappingFile, std::ifstream::binary); + if (!ifs.good()) + { + LERROR << "ERROR: cannot open"+ mappingFile; + throw InvalidConfiguration(); + } + while (ifs.good() && !ifs.eof()) + { + std::string line = Lima::Common::Misc::readLine(ifs); + QStringList strs = QString::fromUtf8(line.c_str()).split('\t'); + if (strs.size() == 2) + { + m_d->m_conllLimaDepMapping.insert(strs[0],strs[1]); + } + } + + } catch (Common::XMLConfigurationFiles::NoSuchParam& ) + { + LINFO << "no parameter 'mappingFile' in ConllDumper group" << " !"; +// throw InvalidConfiguration(); + } +} + +LimaStatusCode ConllDumper::process(AnalysisContent& analysis) const +{ +#ifdef DEBUG_LP + DUMPERLOGINIT; + LDEBUG << "ConllDumper::process"; +#endif + + LinguisticMetaData* metadata = static_cast(analysis.getData("LinguisticMetaData")); + if (metadata == 0) + { + DUMPERLOGINIT; + LERROR << "ConllDumper::process no LinguisticMetaData ! abort"; + return MISSING_DATA; + } + AnnotationData* annotationData = static_cast(analysis.getData("AnnotationData")); + if (annotationData == 0) + { + DUMPERLOGINIT; + LERROR << "ConllDumper::process no AnnotationData ! abort"; + return MISSING_DATA; + } + AnalysisGraph* tokenList=static_cast(analysis.getData(m_d->m_graph));//est de type PosGraph et non pas AnalysisGraph + if (tokenList==0) + { + DUMPERLOGINIT; + LERROR << "ConllDumper::process graph " << m_d->m_graph << " has not been produced: check pipeline"; + return MISSING_DATA; + } + LinguisticGraph* graph=tokenList->getGraph(); + SegmentationData* sd=static_cast(analysis.getData("SentenceBoundaries")); + if (sd==0) + { + DUMPERLOGINIT; + LERROR << "ConllDumper::process no SentenceBoundaries! 
abort"; + return MISSING_DATA; + } + + SyntacticData* syntacticData=static_cast(analysis.getData("SyntacticData")); + if (syntacticData==0) + { + syntacticData=new SyntacticData(tokenList,0); + syntacticData->setupDependencyGraph(); + analysis.setData("SyntacticData",syntacticData); + } + const DependencyGraph* depGraph = syntacticData-> dependencyGraph(); + + QScopedPointer dstream(initialize(analysis)); + + std::map< LinguisticGraphVertex, std::pair > vertexDependencyInformations; + + uint64_t nbSentences((sd->getSegments()).size()); + if (nbSentences == 0) + { + DUMPERLOGINIT; + LERROR << "ConllDumper::process 0 sentence to process"; + return SUCCESS_ID; + } + + std::vector::iterator sbItr=(sd->getSegments().begin()); +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process There are "<< nbSentences << " sentences"; +#endif + LinguisticGraphVertex sentenceBegin = sbItr->getFirstVertex(); + LinguisticGraphVertex sentenceEnd = sbItr->getLastVertex(); + + + const FsaStringsPool& sp=Common::MediaticData::MediaticData::single().stringsPool(m_d->m_language); +// for (auto im=m_d->m_conllLimaDepMapping.begin();im!=m_d->m_conllLimaDepMapping.end();im++) +// { +// LDEBUG << "("<< (*im).first<< "," << (*im).second << ")" << endl; +// } + + LimaConllTokenIdMapping* limaConllTokenIdMapping = static_cast(analysis.getData("LimaConllTokenIdMapping")); + if (limaConllTokenIdMapping == 0) + { + limaConllTokenIdMapping = new LimaConllTokenIdMapping(); + analysis.setData("LimaConllTokenIdMapping", limaConllTokenIdMapping); + } + int sentenceNb=0; + + while (sbItr != sd->getSegments().end() ) //for each sentence + { + sentenceNb++; + sentenceBegin=sbItr->getFirstVertex(); + sentenceEnd=sbItr->getLastVertex(); + std::mapsegmentationMapping;//mapping the two types of segmentations (Lima and conll) + std::mapsegmentationMappingReverse; + +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process begin - end: " << sentenceBegin << " - " << sentenceEnd; +#endif + //LinguisticGraphOutEdgeIt 
outItr,outItrEnd; + QQueue toVisit; + QSet visited; + toVisit.enqueue(sentenceBegin); + int tokenId = 0; + LinguisticGraphVertex v = 0; + while (v != sentenceEnd && !toVisit.empty()) + + { + v = toVisit.dequeue(); +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process Vertex index : " << v; +#endif + visited.insert(v); + segmentationMapping.insert(std::make_pair(v,tokenId)); + segmentationMappingReverse.insert(std::make_pair(tokenId,v)); +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process conll id : " << tokenId << " Lima id : " << v; +#endif + DependencyGraphVertex dcurrent = syntacticData->depVertexForTokenVertex(v); + DependencyGraphOutEdgeIt dit, dit_end; + boost::tie(dit,dit_end) = boost::out_edges(dcurrent,*depGraph); + for (; dit != dit_end; dit++) + { +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process Dumping dependency edge " << (*dit).m_source << " -> " << (*dit).m_target; +#endif + try + { + CEdgeDepRelTypePropertyMap typeMap = get(edge_deprel_type, *depGraph); + SyntacticRelationId type = typeMap[*dit]; + std::string syntRelName=static_cast(Common::MediaticData::MediaticData::single().mediaData(m_d->m_language)).getSyntacticRelationName(type); +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process relation = " << syntRelName; + LDEBUG << "ConllDumper::process Src : Dep vertex= " << boost::source(*dit, *depGraph); + LinguisticGraphVertex src = syntacticData->tokenVertexForDepVertex(boost::source(*dit, *depGraph)); + LDEBUG << "ConllDumper::process Src : Morph vertex= " << src; + LDEBUG << "ConllDumper::process Targ : Dep vertex= " << boost::target(*dit, *depGraph); +#endif + LinguisticGraphVertex dest = syntacticData->tokenVertexForDepVertex(boost::target(*dit, *depGraph)); +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process Targ : Morph vertex= " << dest; +#endif + if (syntRelName!="") + { +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process saving target for" << v << ":" << dest << syntRelName; +#endif + vertexDependencyInformations.insert(std::make_pair(v, 
std::make_pair(dest,syntRelName))); + } + } + catch (const std::range_error& ) + { + } + catch (...) + { +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process: catch others....."; +#endif + throw; + } + } + if (v == sentenceEnd) + { + continue; + } + LinguisticGraphOutEdgeIt outItr,outItrEnd; + for (boost::tie(outItr,outItrEnd)=boost::out_edges(v,*graph); outItr!=outItrEnd; outItr++) + { + LinguisticGraphVertex next=boost::target(*outItr,*graph); + if (!visited.contains(next) && next != tokenList->lastVertex()) + { + toVisit.enqueue(next); + } + } + ++tokenId; + } + + // instead of looking to all vertices, follow the graph (in + // morphological graph, some vertices are not related to main graph: + // idiomatic expressions parts and named entity parts) + + toVisit.clear(); + visited.clear(); + + sentenceBegin=sbItr->getFirstVertex(); + sentenceEnd=sbItr->getLastVertex(); + + // get the list of predicates for the current sentence + QMultiMap predicates = m_d->collectPredicateTokens( analysis, sentenceBegin, sentenceEnd ); +#ifdef DEBUG_LP + //LDEBUG << "ConllDumper::process predicates for sentence between" << sentenceBegin << "and" << sentenceEnd << "are:" << predicates; +#endif + QList< LinguisticGraphVertex > keys = predicates.keys(); + + toVisit.enqueue(sentenceBegin); + tokenId=0; + v=0; + while (!toVisit.empty() && v!=sentenceEnd) + { //as long as there are vertices in the sentence + v = toVisit.dequeue(); + + Token* ft=get(vertex_token,*graph,v); + MorphoSyntacticData* morphoData=get(vertex_data,*graph, v); +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process PosGraph token" << v; +#endif + if( morphoData!=0 && !morphoData->empty() && ft != 0) + { + const QString macro=QString::fromUtf8(static_cast(Common::MediaticData::MediaticData::single().mediaData(m_d->m_language)).getPropertyCodeManager().getPropertyManager("MACRO").getPropertySymbolicValue(morphoData->firstValue(*m_d->m_propertyAccessor)).c_str()); + const QString 
micro=QString::fromUtf8(static_cast(Common::MediaticData::MediaticData::single().mediaData(m_d->m_language)).getPropertyCodeManager().getPropertyManager("MICRO").getPropertySymbolicValue(morphoData->firstValue(*m_d->m_propertyAccessor)).c_str()); +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process graphTag:" << micro; +#endif + + std::string inflectedToken=ft->stringForm().toUtf8().constData(); + std::string lemmatizedToken; + if (morphoData != 0 && !morphoData->empty()) + { + lemmatizedToken=sp[(*morphoData)[0].lemma].toUtf8().constData(); + } + + QString neType = QString::fromUtf8("_") ; + std::set< AnnotationGraphVertex > anaVertices = annotationData->matches("PosGraph",v,"AnalysisGraph"); + // note: anaVertices size should be 0 or 1 + for (std::set< AnnotationGraphVertex >::const_iterator anaVerticesIt = anaVertices.begin(); + anaVerticesIt != anaVertices.end(); anaVerticesIt++) + { + std::set< AnnotationGraphVertex > matches = annotationData->matches("AnalysisGraph",*anaVerticesIt,"annot"); + for (std::set< AnnotationGraphVertex >::const_iterator it = matches.begin(); + it != matches.end(); it++) + { + AnnotationGraphVertex vx=*it; + if (annotationData->hasAnnotation(vx, Common::Misc::utf8stdstring2limastring("SpecificEntity"))) + { + const SpecificEntityAnnotation* se = + annotationData->annotation(vx, Common::Misc::utf8stdstring2limastring("SpecificEntity")). 
+ pointerValue(); + neType = Common::MediaticData::MediaticData::single().getEntityName(se->getType()); + break; + } + } + if (neType != "_") break; + } + QString conllRelName = "_"; + int targetConllId = 0; + if (vertexDependencyInformations.count(v)!=0) + { + LinguisticGraphVertex target=vertexDependencyInformations.find(v)->second.first; +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process target saved for" << v << "is" << target; +#endif + if (segmentationMapping.find(target) != segmentationMapping.end()) + { + targetConllId=segmentationMapping.find(target)->second; + } + else + { + DUMPERLOGINIT; + LERROR << "ConllDumper::process target" << target << "not found in segmantation mapping"; + } +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process conll target saved for " << tokenId << " is " << targetConllId; +#endif + QString relName = QString::fromUtf8(vertexDependencyInformations.find(v)->second.second.c_str()); +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process the lima dependency tag for " + << ft->stringForm()<< " is " << relName; +#endif + if (m_d->m_conllLimaDepMapping.contains(relName)) + { + conllRelName=m_d->m_conllLimaDepMapping[relName]; + } + else + { + conllRelName= relName; +// LERROR << "ConllDumper::process" << relName << "not found in mapping"; + } + } + // Modified CONLL-X format with an extra named entity type column + // http://ilk.uvt.nl/conll/#dataformat + // 1 ID Token counter, starting at 1 for each new sentence. + // 2 FORM Word form or punctuation symbol. + // 3 LEMMA Lemma or stem (depending on particular data set) of word form, or an underscore if not available. + // 4 CPOSTAG Coarse-grained part-of-speech tag, where tagset depends on the language. + // 5 POSTAG Fine-grained part-of-speech tag, where the tagset depends on the language, or identical to the coarse-grained part-of-speech tag if not available. 
+ // 6 NER Extra column: Named entity type + // 7 FEATS Unordered set of syntactic and/or morphological features (depending on the particular language), separated by a vertical bar (|), or an underscore if not available. + // 8 HEAD Head of the current token, which is either a value of ID or zero ('0'). Note that depending on the original treebank annotation, there may be multiple tokens with an ID of zero. + // 9 DEPREL Dependency relation to the HEAD. The set of dependency relations depends on the particular language. Note that depending on the original treebank annotation, the dependency relation may be meaningfull or simply 'ROOT'. + // 10 PHEAD Projective head of current token, which is either a value of ID or zero ('0'), or an underscore if not available. Note that depending on the original treebank annotation, there may be multiple tokens an with ID of zero. The dependency structure resulting from the PHEAD column is guaranteed to be projective (but is not available for all languages), whereas the structures resulting from the HEAD column will be non-projective for some sentences of some languages (but is always available). + // 11 PDEPREL Dependency relation to the PHEAD, or an underscore if not available. The set of dependency relations depends on the particular language. Note that depending on the original treebank annotation, the dependency relation may be meaningfull or simply 'ROOT'. + + QString targetConllIdString = targetConllId > 0 ? 
QString("%1").arg(targetConllId) : "_"; + dstream->out() << tokenId + << "\t" << inflectedToken + << "\t" << lemmatizedToken + << "\t" << macro.toUtf8().constData() + << "\t" << micro.toUtf8().constData() + << "\t" << neType.toUtf8().constData() + << "\t" << "_" + << "\t" << targetConllIdString.toUtf8().constData() + << "\t" << conllRelName.toUtf8().constData() + << "\t" << "_" + << "\t" << "_"; + if (!predicates.isEmpty()) + { + dstream->out() << "\t"; +// LDEBUG << "ConllDumper::process output the predicate if any"; + if (!predicates.contains(v)) + { + // No predicate for this token + dstream->out() << "_"; + } + else + { + // This token is a predicate, output it + QString predicateAnnotation = annotationData->stringAnnotation(predicates.value(v),"Predicate"); + dstream->out() << predicateAnnotation; + } + + // Now output the roles supported by the current PoS graph token +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process output the roles for the" << keys.size() << "predicates"; +#endif + for (int i = 0; i < keys.size(); i++) + { + // There will be one column for each predicate. Output the + // separator right now + dstream->out() << "\t"; + AnnotationGraphVertex predicateVertex = predicates.value(keys[keys.size()-1-i]); + + std::set< AnnotationGraphVertex > vMatches = annotationData->matches("PosGraph", v, "annot"); + if (vMatches.empty()) + { +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process no node matching PoS graph vertex" << v << "in the annotation graph. 
Output '_'."; +#endif + dstream->out() << "_"; + } + else + { +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process there is"<getGraph()); + for (; vMatchInEdgesIt != vMatchInEdgesIt_end; vMatchInEdgesIt++) + { + AnnotationGraphVertex inVertex = boost::source(*vMatchInEdgesIt, annotationData->getGraph()); + std::set< LinguisticGraphVertex > inVertexAnnotPosGraphMatches = annotationData->matches("annot",inVertex,"PosGraph"); + if (inVertex == predicateVertex && !inVertexAnnotPosGraphMatches.empty()) + { + // Current edge is holding a role of the current predicate + roleAnnotation = annotationData->stringAnnotation(*vMatchInEdgesIt,"SemanticRole"); + break; + } + else + { + // Current edge does not hold a role of the current predicate +// dstream->out() << "_"; + } + } + if (roleAnnotation != "_") break; + } + dstream->out() << roleAnnotation.toUtf8().constData(); + } + } + } + dstream->out() << std::endl; + } + + if (v == sentenceEnd) + { + continue; + } +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process look at out edges of" << v; +#endif + LinguisticGraphOutEdgeIt outIter,outIterEnd; + for (boost::tie(outIter,outIterEnd) = boost::out_edges(v,*graph); outIter!=outIterEnd; outIter++) + { + LinguisticGraphVertex next = boost::target(*outIter,*graph); +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process looking out vertex" << next; +#endif + if (!visited.contains(next)) + { +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process enqueuing" << next; +#endif + visited.insert(next); + toVisit.enqueue(next); + } + } + tokenId++; + } + dstream->out() << std::endl; + limaConllTokenIdMapping->insert(std::make_pair(sentenceNb, segmentationMappingReverse)); + sbItr++; + } + + return SUCCESS_ID; + +} + +QMultiMap ConllDumperPrivate::collectPredicateTokens( + Lima::AnalysisContent& analysis, LinguisticGraphVertex sentenceBegin, LinguisticGraphVertex sentenceEnd) +{ +#ifdef DEBUG_LP + DUMPERLOGINIT; +#endif + QMap result; + + AnnotationData* annotationData = 
static_cast(analysis.getData("AnnotationData")); + + AnalysisGraph* tokenList=static_cast(analysis.getData(m_graph)); + if (tokenList==0) { + DUMPERLOGINIT; + LERROR << "graph " << m_graph << " has not been produced: check pipeline"; + return result; + } + LinguisticGraph* graph=tokenList->getGraph(); + + + QQueue toVisit; + QSet visited; + toVisit.enqueue(sentenceBegin); + LinguisticGraphVertex v = 0; + while (v!=sentenceEnd && !toVisit.empty()) + { + v = toVisit.dequeue(); +#ifdef DEBUG_LP + LDEBUG << "ConllDumperPrivate::collectPredicateTokens vertex:" << v; +#endif + visited.insert(v); + + std::set< AnnotationGraphVertex > vMatches = annotationData->matches("PosGraph", v, "annot"); + for (auto it = vMatches.begin(); it != vMatches.end(); it++) + { + AnnotationGraphVertex vMatch = *it; + if (annotationData->hasStringAnnotation(vMatch,"Predicate")) + { +#ifdef DEBUG_LP + LDEBUG << "ConllDumperPrivate::collectPredicateTokens insert" << v << vMatch; +#endif + result.insert(v, vMatch); + } + } + LinguisticGraphOutEdgeIt outItr,outItrEnd;bool newSentence(const QString & line); + for (boost::tie(outItr,outItrEnd)=boost::out_edges(v,*graph); outItr!=outItrEnd; outItr++) + { + LinguisticGraphVertex next=boost::target(*outItr,*graph); + if (!visited.contains(next) && next != tokenList->lastVertex()) + { + toVisit.enqueue(next); + } + } + } + return result; +} + +} // end namespace +} // end namespace +} // end namespace + + + + + +#else + /* + + Copyright 2002-2014 CEA LIST This file is part of LIMA. 
@@ -20,6 +699,7 @@ #include "ConllDumper.h" #include "common/MediaProcessors/DumperStream.h" #include "common/time/traceUtils.h" +#include "common/tools/FileUtils.h" #include "common/Data/strwstrtools.h" #include "common/MediaticData/mediaticData.h" #include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" @@ -381,7 +1061,7 @@ LimaStatusCode ConllDumper::process(AnalysisContent& analysis) const // get the list of predicates for the current sentence QMultiMap predicates = m_d->collectPredicateTokens( analysis, sentenceBegin, sentenceEnd ); #ifdef DEBUG_LP - LDEBUG << "ConllDumper::process predicates for sentence between" << sentenceBegin << "and" << sentenceEnd << "are:" << predicates; + //LDEBUG << "ConllDumper::process predicates for sentence between" << sentenceBegin << "and" << sentenceEnd << "are:" << predicates; #endif QList< LinguisticGraphVertex > keys = predicates.keys(); @@ -659,3 +1339,6 @@ QMultiMap ConllDumperPrivate::coll } // end namespace } // end namespace } // end namespace + + +#endif \ No newline at end of file diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp index 826678ff6..31b3eccd0 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp @@ -1,3 +1,12 @@ + +#ifdef ANTINNO_SPECIFIC + + + + +// antinno travaille avec la version 2.1-patches tant que la version master n'est pas synchronisée + + /* Copyright 2016 CEA LIST @@ -17,6 +26,9 @@ along with LIMA. 
If not, see */ +// Ici sinon compile pas +#include + #include "KnowledgeBasedSemanticRoleLabeler.h" #include "common/Data/LimaString.h" @@ -33,6 +45,470 @@ #include "common/MediaticData/mediaticData.h" #include "common/time/timeUtilsController.h" +#include + + +#include + + + +using namespace std; +using namespace Lima::LinguisticProcessing::AnalysisDumpers; +using namespace Lima::LinguisticProcessing::LinguisticAnalysisStructure; +using namespace Lima::Common::XMLConfigurationFiles; +using namespace Lima::Common::Misc; + + +#define HANDLE_ERROR(Y,Z) if ( Y ) Z ; +#define HANDLE_ERROR_EQUAL(X,Y,Z) if ( X == Y ) Z ; +#define HANDLE_ERROR_RETURN(X,Y,Z) if ( X ) { Y ; return Z; } +#define HANDLE_ERROR_EQUAL_RETURN(X,Y,Z,R) if ( X == Y ) { Z ; return R ; } +#define HANDLE_ERROR_DIFFERENT(X,Y,Z) if ( X != Y ) Z ; +#define HANDLE_ERROR_DIFFERENT_RETURN(X,Y,Z,R) if ( X != Y ) { Z ; return R ; } + + + +namespace Lima +{ +namespace LinguisticProcessing +{ +namespace SemanticAnalysis +{ + +static SimpleFactory knowledgeBasedSemanticRoleLabelerFactory(KNOWLEDGEBASEDSEMANTICROLELABELER_CLASSID); + + +class KnowledgeBasedSemanticRoleLabelerPrivate +{ +public: + KnowledgeBasedSemanticRoleLabelerPrivate(); + virtual ~KnowledgeBasedSemanticRoleLabelerPrivate(); + + PyObject* m_instance; + const MediaProcessUnit* m_dumper; + const MediaProcessUnit* m_loader; + QString m_inputSuffix; + QString m_outputSuffix; + QString m_temporaryFileMetadata; +}; + +KnowledgeBasedSemanticRoleLabelerPrivate::KnowledgeBasedSemanticRoleLabelerPrivate() : + m_instance(0), + m_dumper(new ConllDumper()) +{} + +KnowledgeBasedSemanticRoleLabelerPrivate::~KnowledgeBasedSemanticRoleLabelerPrivate() +{ +} + +KnowledgeBasedSemanticRoleLabeler::KnowledgeBasedSemanticRoleLabeler() : m_d(new KnowledgeBasedSemanticRoleLabelerPrivate()) +{} + + +KnowledgeBasedSemanticRoleLabeler::~KnowledgeBasedSemanticRoleLabeler() +{ + delete m_d; +} + +auto failed_to_import_the_sys_module = []() +{ + SEMANTICANALYSISLOGINIT; + 
LERROR << "Failed to import the sys module"; + PyErr_Print(); +}; + +auto cannot_instantiate_the_semanticrolelabeler_python_class = []() +{ + SEMANTICANALYSISLOGINIT; + LERROR << "Cannot instantiate the SemanticRoleLabeler python class"; + PyErr_Print(); + Py_Exit(1); +}; + +void KnowledgeBasedSemanticRoleLabeler::init( + Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, + Manager* manager) + +{ +#ifdef DEBUG_LP + SEMANTICANALYSISLOGINIT; + LDEBUG << "KnowledgeBasedSemanticRoleLabeler::init"; +#endif + + MediaId language=manager->getInitializationParameters().media; + try { + string dumperName=unitConfiguration.getParamsValueAtKey("dumper"); + // create the dumper + m_d->m_dumper=manager->getObject(dumperName); + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + SEMANTICANALYSISLOGINIT; + LERROR << "Missing 'dumper' parameter in KnowledgeBasedSemanticRoleLabeler group for language " + << (int)language << " !"; + throw InvalidConfiguration(); + } + + try { + string loaderName=unitConfiguration.getParamsValueAtKey("loader"); + // create the loader + m_d->m_loader=manager->getObject(loaderName); + } + catch (InvalidConfiguration& ) { + m_d->m_loader = 0; + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + SEMANTICANALYSISLOGINIT; + LERROR << "Missing 'loader' parameter in KnowledgeBasedSemanticRoleLabeler group for language " + << (int)language << " !"; + throw InvalidConfiguration(); + } + + try { + m_d->m_temporaryFileMetadata = QString::fromUtf8(unitConfiguration.getParamsValueAtKey("temporaryFileMetadata").c_str()); + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + // optional parameter: keep default value (empty) + } + + if (m_d->m_temporaryFileMetadata.isEmpty()) + { + try { + m_d->m_inputSuffix=QString::fromUtf8(unitConfiguration.getParamsValueAtKey("inputSuffix").c_str()); + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + SEMANTICANALYSISLOGINIT; + LERROR << "Missing 'inputSuffix' 
parameter in KnowledgeBasedSemanticRoleLabeler group for language " + << (int)language << " !"; + throw InvalidConfiguration(); + } + + try { + m_d->m_outputSuffix=QString::fromUtf8(unitConfiguration.getParamsValueAtKey("outputSuffix").c_str()); + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + SEMANTICANALYSISLOGINIT; + LERROR << "Missing 'outputSuffix' parameter in KnowledgeBasedSemanticRoleLabeler group for language " + << (int)language << " !"; + throw InvalidConfiguration(); + } + } + QString path; + QString mode = "VerbNet"; + QString kbsrlLogLevel = "error"; + + try + { + kbsrlLogLevel = QString::fromUtf8(unitConfiguration.getParamsValueAtKey("loglevel").c_str()); + } + catch (NoSuchParam& ) + { + // keep default + } + + try + { + path = QString::fromUtf8(unitConfiguration.getParamsValueAtKey("path").c_str()); + } + catch (NoSuchParam& ) + { + SEMANTICANALYSISLOGINIT; + LERROR << "no param 'path' in KnowledgeBasedSemanticRoleLabeler group configuration"; + throw InvalidConfiguration(); + } + + try + { + mode = QString::fromUtf8(unitConfiguration.getParamsValueAtKey("mode").c_str()); + if (mode != "VerbNet" && mode != "FrameNet") + { + SEMANTICANALYSISLOGINIT; + LERROR << "Unknown semantic annotation mode" << mode; + throw InvalidConfiguration(); + } + } + catch (NoSuchParam& ) + { + // keep default + } + + // Initialize the python SRL system + /* + * Find the first python executable in the path and use it as the program name. + * + * This allows to find the modules set up in an activated virtualenv + */ + QString str_program_name; + QString pathEnv = QString::fromUtf8(qgetenv("PATH").constData()); +#ifdef ANTINNO_ASFALDA + str_program_name = "c://python24//python.exe"; + wchar_t* pythonPath = L"c://python24//python.exe"; + //Py_SetProgramName(const_cast( str_program_name.toStdWString().c_str())); + // Erreur de link que je pige pas, je triche... 
+ Py_SetProgramName(pythonPath); +#else + for (const auto & path: pathEnv.split(QRegExp("[;:]"))) + { + if (QFile::exists(path + "/python" )) + { + str_program_name = path + "/python"; + break; + } + } +#ifndef WIN32 + Py_SetProgramName(const_cast( str_program_name.toStdWString().c_str())); +#else + Py_SetProgramName( (wchar_t*)str_program_name.unicode() ); +#endif +#endif + + + Py_Initialize(); + + PyObject* main_module = PyImport_ImportModule("__main__"); + PyObject* main_dict = PyModule_GetDict(main_module); + PyObject* sys_module = PyImport_ImportModule("sys"); + HANDLE_ERROR_EQUAL (sys_module, NULL, failed_to_import_the_sys_module() ); + + PyDict_SetItemString(main_dict, "sys", sys_module); + + // Add the path to the knowledgesrl pachkage to putho path + PyObject* pythonpath = PySys_GetObject("path"); + if (PyList_Append(pythonpath, PyUnicode_DecodeFSDefault("D:/telechargement Amose/knowledgesrl/src")) == -1) + { + SEMANTICANALYSISLOGINIT; + LERROR << "Failed to append to python path"; + PyErr_Print(); + Py_Exit(1); + } + + // Import the semanticrolelabeler module + PyObject* semanticrolelabeler_module = PyImport_ImportModule("semanticrolelabeler"); + if (semanticrolelabeler_module == NULL) + { + SEMANTICANALYSISLOGINIT; + LERROR << "Failed to import srl semanticrolelabeler module"; + PyErr_Print(); + Py_Exit(1); + } + + // Create the semantic role labeller instance + m_d->m_instance = PyObject_CallMethod(semanticrolelabeler_module, "SemanticRoleLabeler", "[sss]", + QString("--log=%1").arg(kbsrlLogLevel).toUtf8().constData(), + QString("--frame-lexicon=%1").arg(mode).toUtf8().constData(), + QString("--language=%1").arg(Lima::Common::MediaticData::MediaticData::single().getMediaId(language).c_str()).toUtf8().constData()); + HANDLE_ERROR_EQUAL(m_d->m_instance,NULL,cannot_instantiate_the_semanticrolelabeler_python_class()) +} + +auto metadata_equal_zero = []() +{ + SEMANTICANALYSISLOGINIT; + LERROR << "no LinguisticMetaData ! 
abort"; +}; + +auto temporary_file_not_open = []() +{ + SEMANTICANALYSISLOGINIT; + LERROR << "KnowledgeBasedSemanticRoleLabeler: unable to create temporary file"; +}; + +auto temporary_file_srl_not_open = [](QScopedPointer& temporaryFile) +{ + SEMANTICANALYSISLOGINIT; + LERROR << "KnowledgeBasedSemanticRoleLabeler: unable to open temporary file for dumping SRL CoNLL data to it"<< temporaryFile->fileName(); + LERROR << "KnowledgeBasedSemanticRoleLabeler: keep (do not auto remove) it for debug purpose." ; + temporaryFile->setAutoRemove(false); +}; + +auto failed_to_load_data_from_temporary_file = [](QScopedPointer& temporaryFile) +{ + SEMANTICANALYSISLOGINIT; + LERROR << "KnowledgeBasedSemanticRoleLabeler: failed to load data from temporary file" << temporaryFile->fileName(); + LERROR << "KnowledgeBasedSemanticRoleLabeler: keep (do not auto remove) it for debug purpose." << temporaryFile->fileName(); + temporaryFile->setAutoRemove(false); +}; + +auto failure_during_call_of_the_annotate_method_on = [](QString& conllInput) +{ + SEMANTICANALYSISLOGINIT; + LERROR << "Failure during call of the annotate method on" << conllInput; + PyErr_Print(); + Py_Exit(1); +}; + +LimaStatusCode KnowledgeBasedSemanticRoleLabeler::process( + AnalysisContent& analysis) const +{ + TimeUtilsController knowledgeBasedSemanticRoleLabelerProcessTime("KnowledgeBasedSemanticRoleLabeler"); + SEMANTICANALYSISLOGINIT; + LINFO << "start SRL process"; + + LinguisticMetaData* metadata=static_cast(analysis.getData("LinguisticMetaData")); + HANDLE_ERROR_EQUAL_RETURN(metadata,0,metadata_equal_zero(),MISSING_DATA) + + QScopedPointer temporaryFile; + if (!m_d->m_temporaryFileMetadata.isEmpty()) + { + QScopedPointer otherTemp(new QTemporaryFile()); + temporaryFile.swap(otherTemp); + HANDLE_ERROR_RETURN(!temporaryFile->open(),temporary_file_not_open(),CANNOT_OPEN_FILE_ERROR); + metadata->setMetaData(m_d->m_temporaryFileMetadata.toUtf8().constData(), + temporaryFile->fileName().toUtf8().constData()); + } + + 
// Use CoNLL duper to produce the input to the SRL + LimaStatusCode returnCode(SUCCESS_ID); + returnCode=m_d->m_dumper->process(analysis); + if (returnCode!=SUCCESS_ID) { + LERROR << "KnowledgeBasedSemanticRoleLabeler: failed to dump data to temporary file"; + return returnCode; + } + + QString conllInput; + + if (m_d->m_temporaryFileMetadata.isEmpty()) + { + QString fileName = QString::fromUtf8(metadata->getMetaData("FileName").c_str()); + QString inputFilename; + if (!m_d->m_inputSuffix.isEmpty()) + { + inputFilename = fileName+ m_d->m_inputSuffix; + } + QFile inputFile(inputFilename); + inputFile.open(QIODevice::ReadOnly); + conllInput = QString::fromUtf8(inputFile.readAll().constData()); + inputFile.close(); + } + else + { + if (!temporaryFile->open()) + { + SEMANTICANALYSISLOGINIT; + LERROR << "KnowledgeBasedSemanticRoleLabeler: unable to open temporary file after dumping CoNLL data to it"<< temporaryFile->fileName(); + LERROR << "KnowledgeBasedSemanticRoleLabeler: keep (do not auto remove) it for debug purpose." 
; + temporaryFile->setAutoRemove(false); + return CANNOT_OPEN_FILE_ERROR; + } + conllInput = QString::fromUtf8(temporaryFile->readAll().constData()); +#ifdef DEBUG_LP + temporaryFile->setAutoRemove(false); + SEMANTICANALYSISLOGINIT; + LDEBUG << "KnowledgeBasedSemanticRoleLabeler: keeping temporary file after dumping CoNLL data to it for debugging"<< temporaryFile->fileName(); +#endif + temporaryFile->close(); + } + + // Run the semantic role labeller + PyObject* callResult = PyObject_CallMethod(m_d->m_instance, "annotate", "s", conllInput.toUtf8().constData()); + HANDLE_ERROR_EQUAL(callResult, NULL, failure_during_call_of_the_annotate_method_on(conllInput)); + + // Display the SRL result + char* result = PyUnicode_AsUTF8(callResult); + if (result == NULL) + { + LERROR << "Cannot convert result item to string"; + PyErr_Print(); + Py_Exit(1); + } + LDEBUG << "Python result is:" << result; + if (m_d->m_temporaryFileMetadata.isEmpty()) + { + QString outputFilename; + if (!m_d->m_outputSuffix.isEmpty()) + { + QString fileName = QString::fromUtf8(metadata->getMetaData("FileName").c_str()); + outputFilename = fileName + m_d->m_outputSuffix; + } + QFile outputFile(outputFilename); + outputFile.open(QIODevice::WriteOnly); + outputFile.write(result); + outputFile.close(); + } + else + { + HANDLE_ERROR_RETURN( !temporaryFile->open(), + temporary_file_srl_not_open(temporaryFile), CANNOT_OPEN_FILE_ERROR); + if (!temporaryFile->seek(0)) + { + SEMANTICANALYSISLOGINIT; + LERROR << "KnowledgeBasedSemanticRoleLabeler: unable to seek to the beginning of temporary file"<< temporaryFile->fileName(); + LERROR << "KnowledgeBasedSemanticRoleLabeler: keep (do not auto remove) it for debug purpose." 
; + temporaryFile->setAutoRemove(false); + return UNKNOWN_ERROR; + } + if (temporaryFile->write(result) == -1) + { + SEMANTICANALYSISLOGINIT; + LERROR << "KnowledgeBasedSemanticRoleLabeler: unable to write SRL result to temporary file"<< temporaryFile->fileName(); + LERROR << "KnowledgeBasedSemanticRoleLabeler: keep (do not auto remove) it for debug purpose." ; + temporaryFile->setAutoRemove(false); + return UNKNOWN_ERROR; + } + temporaryFile->close(); + } + Py_DECREF(callResult); + // Import the CoNLL result + returnCode=m_d->m_loader->process(analysis); + HANDLE_ERROR_DIFFERENT_RETURN(returnCode,SUCCESS_ID,failed_to_load_data_from_temporary_file(temporaryFile),returnCode) + + + return returnCode; +} + +} //namespace SemanticAnalysis +} // namespace LinguisticProcessing +} // namespace Lima + + + + + + + +#else + + + + + + + +// version master + +/* + Copyright 2016 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. 
If not, see +*/ + +#include "KnowledgeBasedSemanticRoleLabeler.h" + +#include "common/Data/LimaString.h" +#include "common/misc/Exceptions.h" +#include "common/Data/strwstrtools.h" +#include "common/AbstractFactoryPattern/SimpleFactory.h" +#include "linguisticProcessing/core/SemanticAnalysis/ConllDumper.h" +#include "linguisticProcessing/core/LinguisticResources/LinguisticResources.h" +#include "linguisticProcessing/core/LinguisticProcessors/LimaStringText.h" +#include "linguisticProcessing/core/LinguisticProcessors/LinguisticMetaData.h" +#include "linguisticProcessing/core/LinguisticAnalysisStructure/AnalysisGraph.h" +#include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" +#include "common/tools/FileUtils.h" +#include "common/MediaticData/mediaticData.h" +#include "common/time/timeUtilsController.h" + #include #include #include @@ -125,6 +601,7 @@ void KnowledgeBasedSemanticRoleLabeler::init( m_d->m_dumper=manager->getObject(dumperName); } catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + SEMANTICANALYSISLOGINIT; LERROR << "Missing 'dumper' parameter in KnowledgeBasedSemanticRoleLabeler group for language " << (int)language << " !"; throw InvalidConfiguration(); @@ -139,6 +616,7 @@ void KnowledgeBasedSemanticRoleLabeler::init( m_d->m_loader = 0; } catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + SEMANTICANALYSISLOGINIT; LERROR << "Missing 'loader' parameter in KnowledgeBasedSemanticRoleLabeler group for language " << (int)language << " !"; throw InvalidConfiguration(); @@ -227,7 +705,11 @@ void KnowledgeBasedSemanticRoleLabeler::init( break; } } +#ifndef WIN32 Py_SetProgramName(const_cast( str_program_name.toStdWString().c_str())); +#else + Py_SetProgramName( (wchar_t*)str_program_name.unicode() ); +#endif Py_Initialize(); @@ -243,6 +725,7 @@ void KnowledgeBasedSemanticRoleLabeler::init( PyObject* pythonpath = PySys_GetObject("path"); if (PyList_Append(pythonpath, PyUnicode_DecodeFSDefault(path.toUtf8().constData())) == -1) { + 
SEMANTICANALYSISLOGINIT; LERROR << "Failed to append to python path"; PyErr_Print(); Py_Exit(1); @@ -306,8 +789,10 @@ LimaStatusCode KnowledgeBasedSemanticRoleLabeler::process( AnalysisContent& analysis) const { TimeUtilsController knowledgeBasedSemanticRoleLabelerProcessTime("KnowledgeBasedSemanticRoleLabeler"); +#ifdef DEBUG_LP SEMANTICANALYSISLOGINIT; LINFO << "start SRL process"; +#endif LinguisticMetaData* metadata=static_cast(analysis.getData("LinguisticMetaData")); HANDLE_ERROR_EQUAL_RETURN(metadata,0,metadata_equal_zero(),MISSING_DATA) @@ -326,6 +811,7 @@ LimaStatusCode KnowledgeBasedSemanticRoleLabeler::process( LimaStatusCode returnCode(SUCCESS_ID); returnCode=m_d->m_dumper->process(analysis); if (returnCode!=SUCCESS_ID) { + SEMANTICANALYSISLOGINIT; LERROR << "KnowledgeBasedSemanticRoleLabeler: failed to dump data to temporary file"; return returnCode; } @@ -375,11 +861,14 @@ LimaStatusCode KnowledgeBasedSemanticRoleLabeler::process( char* result = PyUnicode_AsUTF8(callResult); if (result == NULL) { + SEMANTICANALYSISLOGINIT; LERROR << "Cannot convert result item to string"; PyErr_Print(); Py_Exit(1); } +#ifdef DEBUG_LP LDEBUG << "Python result is:" << result; +#endif if (m_d->m_temporaryFileMetadata.isEmpty()) { QString outputFilename; @@ -420,10 +909,17 @@ LimaStatusCode KnowledgeBasedSemanticRoleLabeler::process( returnCode=m_d->m_loader->process(analysis); HANDLE_ERROR_DIFFERENT_RETURN(returnCode,SUCCESS_ID,failed_to_load_data_from_temporary_file(temporaryFile),returnCode) - return returnCode; } } //namespace SemanticAnalysis } // namespace LinguisticProcessing } // namespace Lima + + + + + + + +#endif \ No newline at end of file diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRelationsXmlLogger.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRelationsXmlLogger.cpp index c35328c5a..535831b19 100644 --- 
a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRelationsXmlLogger.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRelationsXmlLogger.cpp @@ -1,3 +1,307 @@ +#ifdef ANTINNO_SPECIFIC + + + + +// antinno travaille avec la version 2.1-patches tant que la version master n'est pas synchronise + + + + + +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. 
If not, see +*/ +/************************************************************************ + * + * @file SemanticRelationsXmlLogger.cpp + * @author (romaric.besancon@cea.fr) + * @date Mon Sep 17 2007 + * copyright Copyright (C) 2007 by CEA LIST + * + ***********************************************************************/ + + +#include "SemanticRelationsXmlLogger.h" +#include "SemanticRelationAnnotation.h" +#include "SemanticAnnotation.h" + +#include "common/MediaticData/mediaticData.h" +#include "common/Data/strwstrtools.h" +#include "common/time/traceUtils.h" +#include "common/AbstractFactoryPattern/SimpleFactory.h" +//#include "common/annotationGraph/AnnotationData.h" +#include "linguisticProcessing/core/LinguisticProcessors/LinguisticMetaData.h" +#include "linguisticProcessing/core/LinguisticAnalysisStructure/AnalysisGraph.h" +#include "linguisticProcessing/core/LinguisticAnalysisStructure/Token.h" +#include "linguisticProcessing/core/Automaton/SpecificEntityAnnotation.h" + +#include + +#define SEMLOGINIT LOGINIT("LP::SemanticAnalysis") + +using namespace std; +using namespace boost; +using namespace Lima::Common::AnnotationGraphs; +using namespace Lima::LinguisticProcessing::SpecificEntities; + +namespace Lima { +namespace LinguisticProcessing { +namespace SemanticAnalysis { + +SimpleFactory +semanticRelationsXmlLoggerFactory(SEMANTICRELATIONSXMLLOGGER_CLASSID); + +SemanticRelationsXmlLogger::SemanticRelationsXmlLogger() : +AbstractLinguisticLogger(".output.xml"), +m_language(0), +m_graph("PosGraph") +{} + + +SemanticRelationsXmlLogger::~SemanticRelationsXmlLogger() +{} + +void SemanticRelationsXmlLogger::init( + Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, + Manager* manager) + +{ + AbstractLinguisticLogger::init(unitConfiguration,manager); + + m_language=manager->getInitializationParameters().media; + + try + { + m_graph=unitConfiguration.getParamsValueAtKey("graph"); + } + catch (Common::XMLConfigurationFiles::NoSuchParam& 
) + { + SEMLOGINIT; + LWARN << "No 'graph' parameter in unit configuration '" + << unitConfiguration.getName() << "' ; using PosGraph"; + m_graph=string("PosGraph"); + } +} + +LimaStatusCode SemanticRelationsXmlLogger:: +process(AnalysisContent& analysis) const +{ + TimeUtils::updateCurrentTime(); + + SEMLOGINIT; + LERROR << "SemanticRelationsXmlLogger"; + + AnnotationData* annotationData = static_cast< AnnotationData* >(analysis.getData("AnnotationData")); + + const LinguisticAnalysisStructure::AnalysisGraph& graph = + *(static_cast(analysis.getData(m_graph))); + + LinguisticGraph* lingGraph = const_cast(graph.getGraph()); + VertexTokenPropertyMap tokenMap = get(vertex_token, *lingGraph); + LinguisticMetaData* metadata=static_cast(analysis.getData("LinguisticMetaData")); + if (metadata == 0) { + SEMLOGINIT; + LERROR << "no LinguisticMetaData ! abort"; + return MISSING_DATA; + } + + ofstream out; + if (!openLogFile(out,metadata->getMetaData("FileName"))) { + SEMLOGINIT; + LERROR << "Can't open log file "; + return UNKNOWN_ERROR; + } + + uint64_t offset(0); + try { + offset=atoi(metadata->getMetaData("StartOffset").c_str()); + } + catch (LinguisticProcessingException& e) { + // do nothing: not set in analyzeText (only in analyzeXmlDocuments) + } + + uint64_t offsetIndexingNode(0); + try { + offsetIndexingNode=atoi(metadata->getMetaData("StartOffsetIndexingNode").c_str()); + } + catch (LinguisticProcessingException& e) { + // do nothing: not set in analyzeText (only in analyzeXmlDocuments) + } + + std::string docId(""); + try { + docId=metadata->getMetaData("DocId"); + } + catch (LinguisticProcessingException& e) { + // do nothing: not set in analyzeText (only in analyzeXmlDocuments) + } + + out << "" << endl; + +// LDEBUG << "SemanticRelationsXmlLogger on graph " << m_graph; + + //look at all vertices for annotations + AnnotationGraphVertexIt itv, itv_end; + boost::tie(itv, itv_end) = vertices(annotationData->getGraph()); + for (; itv != itv_end; itv++) + { + LDEBUG 
<< "SemanticRelationsXmlLogger on annotation vertex " << *itv; + if (annotationData->hasAnnotation(*itv,("SemanticAnnotation"))) + { +// LDEBUG << " it has SemanticRelationAnnotation"; + const SemanticAnnotation* annot = 0; + try + { + annot = annotationData->annotation(*itv,("SemanticAnnotation")) + .pointerValue(); + } + catch (const boost::bad_any_cast& e) + { + SEMLOGINIT; + LERROR << "This annotation is not a SemanticRelation"; + continue; + } + + // output + out << "getType() << "\">" << endl + << vertexStringForSemanticAnnotation("vertex",*itv,tokenMap,annotationData,offset) + << "" << endl; + } + } + + // look at all edges for relations + AnnotationGraphEdgeIt it,it_end; + const AnnotationGraph& annotGraph=annotationData->getGraph(); + boost::tie(it, it_end) = edges(annotGraph); + for (; it != it_end; it++) { + LDEBUG << "SemanticRelationsXmlLogger on annotation edge " + << source(*it,annotGraph) << "->" << target(*it,annotationData->getGraph()); + if (annotationData->hasAnnotation(*it,("SemanticRelation"))) + { + SEMLOGINIT; + LDEBUG << "found semantic relation"; + const SemanticRelationAnnotation* annot = 0; + try + { + annot = annotationData->annotation(*it,("SemanticRelation")) + .pointerValue(); + } + catch (const boost::bad_any_cast& e) + { + SEMLOGINIT; + LERROR << "This annotation is not a SemanticAnnotation"; + continue; + } + + //output + out << "type() << "\">" << endl + << vertexStringForSemanticAnnotation("source",source(*it,annotGraph),tokenMap,annotationData,offset) + << vertexStringForSemanticAnnotation("target",target(*it,annotGraph),tokenMap,annotationData,offset) + << "" << endl; + + } + } + +// LDEBUG << " all vertices done"; + out << "" << endl; + out.close(); + + TimeUtils::logElapsedTime("SemanticRelationsXmlLogger"); + return SUCCESS_ID; +} + +std::string SemanticRelationsXmlLogger:: +vertexStringForSemanticAnnotation(const std::string& vertexRole, + const AnnotationGraphVertex& vertex, + const VertexTokenPropertyMap& tokenMap, + 
AnnotationData* annotationData, + uint64_t offset) const +{ + ostringstream oss; + + // get id of the corresponding vertex in analysis graph + LinguisticGraphVertex v; + if (!annotationData->hasIntAnnotation(vertex,Common::Misc::utf8stdstring2limastring(m_graph))) + { + // SEMLOGINIT; + // LDEBUG << *itv << " has no " << m_graph << " annotation. Skeeping it."; + return ""; + } + v = annotationData->intAnnotation(vertex,Common::Misc::utf8stdstring2limastring(m_graph)); + LinguisticAnalysisStructure::Token* vToken = tokenMap[v]; + // LDEBUG << "SemanticRelationsXmlLogger tokenMap[" << v << "] = " << vToken; + if (vToken == 0) + { + SEMLOGINIT; + LERROR << "Vertex " << v << " has no entry in the analysis graph token map. This should not happen !!"; + return ""; + } + + // get annotation : element in relation can be an entity => get entity type + // otherwise, its type is "token" + std::string type("token"); + + auto matches = annotationData->matches(m_graph,v,"annot"); + for (auto it = matches.begin(); it != matches.end(); it++) + { + if (annotationData->hasAnnotation(*it,Common::Misc::utf8stdstring2limastring("SpecificEntity"))) { + const SpecificEntityAnnotation* annot = 0; + try { + annot = annotationData->annotation(*it,Common::Misc::utf8stdstring2limastring("SpecificEntity")) + .pointerValue(); + } + catch (const boost::bad_any_cast& e) { + SEMLOGINIT; + LERROR << "This annotation is not a SemanticAnnotation"; + continue; + } + type=Common::Misc::limastring2utf8stdstring(Common::MediaticData::MediaticData::single().getEntityName(annot->getType())); + break; + } + } + + oss << " <" << vertexRole + << " type=\"" << type << "\"" + << " pos=\"" << offset+vToken->position() << "\"" + << " len=\"" << vToken->length() << "\"" + << " string=\"" << vToken->stringForm() << "\"" + << "/>" << endl; + return oss.str(); +} + + +} // SemanticAnalysis +} // LinguisticProcessing +} // Lima + + + + + +#else + + +// version master + + /* Copyright 2002-2013 CEA LIST @@ -6,6 +310,8 
@@ LIMA is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or + + (at your option) any later version. LIMA is distributed in the hope that it will be useful, @@ -25,10 +331,14 @@ * ***********************************************************************/ + #include "SemanticRelationsXmlLogger.h" #include "SemanticRelationAnnotation.h" #include "SemanticAnnotation.h" +// #include "common/linguisticData/linguisticData.h" +//#include "common/misc/strwstrtools.h" +//#include "common/misc/traceUtils.h" #include "common/MediaticData/mediaticData.h" #include "common/Data/strwstrtools.h" #include "common/time/traceUtils.h" @@ -279,3 +589,7 @@ vertexStringForSemanticAnnotation(const std::string& vertexRole, } // SemanticAnalysis } // LinguisticProcessing } // Lima + + + +#endif \ No newline at end of file diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRelationsXmlLogger.h b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRelationsXmlLogger.h index 53d135d57..29dfb7492 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRelationsXmlLogger.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRelationsXmlLogger.h @@ -1,3 +1,12 @@ +#ifdef ANTINNO_SPECIFIC + + + + +// antinno travaille avec la version 2.1-patches tant que la version master n'est pas synchronise + + + /* Copyright 2002-2013 CEA LIST @@ -78,3 +87,97 @@ class SemanticRelationsXmlLogger : public AbstractLinguisticLogger } // Lima #endif + + + +#else + + +// version master + + +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. 
+ + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. If not, see +*/ +/************************************************************************ + * + * @file SemanticRelationsXmlLogger.h + * @author Romaric Besancon (romaric.besancon@cea.fr) + * @date Wed Sep 12 2007 + * copyright Copyright (C) 2007 by CEA LIST + * Project s2lp + * + * @brief xml logger for the semantic relation annotations from the + * annotation graph + * + * + ***********************************************************************/ + +#ifndef SEMANTICRELATIONSXMLLOGGERSEMANTICRELATIONSXMLLOGGER_H +#define SEMANTICRELATIONSXMLLOGGERSEMANTICRELATIONSXMLLOGGER_H + +#include "linguisticProcessing/core/LinguisticProcessors/AbstractTextualAnalysisDumper.h" +#include "linguisticProcessing/core/LinguisticProcessors/AbstractLinguisticLogger.h" +#include "linguisticProcessing/core/LinguisticAnalysisStructure/AnalysisGraph.h" +#include "linguisticProcessing/common/annotationGraph/AnnotationData.h" + +namespace Lima +{ +namespace LinguisticProcessing +{ +namespace SemanticAnalysis +{ + +#define SEMANTICRELATIONSXMLLOGGER_CLASSID "SemanticRelationsXmlLogger" +class SemanticRelationsXmlLogger : public AbstractLinguisticLogger +{ +public: + SemanticRelationsXmlLogger(); + + virtual ~SemanticRelationsXmlLogger(); + + virtual void init( + Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, + Manager* manager) + ; + + virtual LimaStatusCode process( + 
AnalysisContent& analysis) const; + +private: + MediaId m_language; + std::string m_graph; + + // private memeber functions + std::string vertexStringForSemanticAnnotation(const std::string& vertexRole, + const LinguisticGraphVertex& vertex, + const VertexTokenPropertyMap& tokenMap, + Common::AnnotationGraphs::AnnotationData* annotationData, + uint64_t offset) const; + +}; + +} // SemanticAnalysis +} // LinguisticProcessing +} // Lima + +#endif + + + +#endif \ No newline at end of file diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRoleLabelingLoader.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRoleLabelingLoader.cpp index dd074ac27..575fdb5a3 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRoleLabelingLoader.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRoleLabelingLoader.cpp @@ -1,3 +1,13 @@ +#ifdef ANTINNO_SPECIFIC + + + + +// antinno travaille avec la version 2.1-patches tant que la version master n'est pas synchronisée + + + + /* Copyright 2002-2014 CEA LIST @@ -27,7 +37,9 @@ #include "SemanticRoleLabelingLoader.h" #include "LimaConllTokenIdMapping.h" - +#ifdef ANTINNO_SPECIFIC +#include +#endif #include "common/AbstractFactoryPattern/SimpleFactory.h" #include "common/Data/strwstrtools.h" #include "common/MediaticData/mediaticData.h" @@ -66,9 +78,14 @@ namespace SemanticAnalysis { SimpleFactory SemanticRoleLabelingFactory(SEMANTICROLELABELINGLOADER_CLASSID); +#define NBCOLSINSRLBEFOREFRAME 11 +#define CONLLTOKENSEPARATOR "[\\r\\n]" +#define CONLLFIELDSEPARATOR "\\t" +/* #define NBCOLSINSRLBEFOREFRAME 11 #define CONLLTOKENSEPARATOR "\n+" #define CONLLFIELDSEPARATOR "\t" +*/ // Conll handler struct ConllHandler @@ -184,6 +201,7 @@ LimaStatusCode SemanticRoleLabelingLoader::process(AnalysisContent& analysis) co AnnotationData* annotationData = 
static_cast(analysis.getData("AnnotationData")); LimaConllTokenIdMapping* limaConllMapping = static_cast(analysis.getData("LimaConllTokenIdMapping")); +<<<<<<< .mine LinguisticMetaData* metadata=static_cast(analysis.getData("LinguisticMetaData")); if (metadata == 0) { @@ -192,6 +210,7 @@ LimaStatusCode SemanticRoleLabelingLoader::process(AnalysisContent& analysis) co } QString fileName = QString::fromUtf8((metadata->getMetaData("FileName")+m_inputFileExtension).c_str()); + QFile file(fileName); @@ -232,13 +251,14 @@ LimaStatusCode SemanticRoleLabelingLoader::process(AnalysisContent& analysis) co { LinguisticGraphVertex posGraphPredicateVertex=cHandler.m_verbalClasses[vClassIndex].first; QStringList verbalClasses = cHandler.m_verbalClasses[vClassIndex].second.split("|"); - for (QString& verbalClass: verbalClasses) - { + //for (QString& verbalClass: verbalClasses) + // Modif NAN compatibilité de compilation + for (QStringList::iterator it=verbalClasses.begin(); it!=verbalClasses.end(); ++it) + { + QString& verbalClass = *it; verbalClass = m_d->m_model + "." 
+ verbalClass; } LimaString verbalClass= verbalClasses.join("|"); - - AnnotationGraphVertex annotPredicateVertex=annotationData->createAnnotationVertex(); annotationData->addMatching("PosGraph", posGraphPredicateVertex, "annot", annotPredicateVertex); annotationData->annotate(annotPredicateVertex, "Predicate", verbalClass); @@ -250,15 +270,18 @@ LimaStatusCode SemanticRoleLabelingLoader::process(AnalysisContent& analysis) co std::vector >::iterator semRoleIt; for (semRoleIt=cHandler.m_semanticRoles[vClassIndex].begin(); semRoleIt!=cHandler.m_semanticRoles[vClassIndex].end();semRoleIt++){ LinguisticGraphVertex posGraphRoleVertex=(*semRoleIt).first; + QStringList semanticRoles = (*semRoleIt).second.split("|"); - for (QString& semanticRole: semanticRoles) + //for (QString& semanticRole: semanticRoles) + // Modif NAN compatibilité de compilation + for (QStringList::iterator it=semanticRoles.begin(); it!=semanticRoles.end(); ++it) { - if (!semanticRole.isEmpty()) - semanticRole = m_d->m_model + "." + semanticRole; + QString& semanticRole = *it; + if (!semanticRole.isEmpty()) + semanticRole = m_d->m_model + "." 
+ semanticRole; } LimaString semanticRole= semanticRoles.join("|"); - - AnnotationGraphVertex annotRoleVertex=annotationData->createAnnotationVertex(); + AnnotationGraphVertex annotRoleVertex=annotationData->createAnnotationVertex(); AnnotationGraphEdge roleEdge=annotationData->createAnnotationEdge(annotPredicateVertex, annotRoleVertex); annotationData->annotate(roleEdge, "SemanticRole", semanticRole); annotationData->addMatching("PosGraph", posGraphRoleVertex, "annot", annotRoleVertex); @@ -297,6 +320,10 @@ bool ConllHandler::extractSemanticInformation(int sentenceI, LimaConllTokenIdMap SEMANTICANALYSISLOGINIT; ConllHandler cHandler(m_language, m_analysis, m_graph); QStringList sentenceTokens=cHandler.splitSegment(sent, m_tokenSeparator); + if (sentenceTokens.isEmpty()) + { + return false; + } QString firstSentenceToken=(*sentenceTokens.constBegin()); int descriptorsNb = cHandler.splitSegment(firstSentenceToken, m_descriptorSeparator).size(); m_verbalClassNb = descriptorsNb - NBCOLSINSRLBEFOREFRAME - 1; @@ -311,7 +338,11 @@ bool ConllHandler::extractSemanticInformation(int sentenceI, LimaConllTokenIdMap m_semanticRoles.clear(); m_semanticRoles.resize(m_verbalClassNb); //repeated on each token of the sentence, that is on each line +#ifdef ANTINNO_SPECIFIC +BOOST_FOREACH (const auto & token, sentenceTokens) +#else for (const auto & token: sentenceTokens) +#endif { int roleNumbers=0; QStringList descriptors=cHandler.splitSegment(token,m_descriptorSeparator); @@ -338,8 +369,457 @@ bool ConllHandler::extractSemanticInformation(int sentenceI, LimaConllTokenIdMap m_verbalClasses[classIndex]=qMakePair(limaTokenId, vClass); classIndex++; } +#ifdef ANTINNO_SPECIFIC + BOOST_FOREACH (auto roleTargetFieldIndex, boost::irange(0,m_verbalClassNb)) +#else + for (auto roleTargetFieldIndex : boost::irange(0,m_verbalClassNb)) +#endif + { +#ifdef DEBUG_LP + LDEBUG << "ConllHandler::extractSemanticInformation"<<"nb descriptors and roleTargetFieldIndex" << descriptors.size() << 
roleTargetFieldIndex ; +#endif + if (NBCOLSINSRLBEFOREFRAME+1+roleTargetFieldIndex >= descriptors.size()) + { + LERROR << "ConllHandler::extractSemanticInformation roleTargetFieldIndex error" << roleTargetFieldIndex; + break; + } + if (descriptors[NBCOLSINSRLBEFOREFRAME+1+roleTargetFieldIndex]!="_") + { + QString semanticRoleLabel=descriptors[NBCOLSINSRLBEFOREFRAME+1+roleTargetFieldIndex]; + + LinguisticGraphVertex limaTokenId=cHandler.getLimaTokenId(conllTokenId, sentenceI, limaConllMapping); + if(limaTokenId!=0) + { +#ifdef DEBUG_LP + LDEBUG << "ConllHandler::extractSemanticInformation argument "<> sRoles; + if (roleTargetFieldIndex >= m_semanticRoles.size()) + { + LERROR << "ConllHandler::extractSemanticInformation roleTargetFieldIndex error 2" << roleTargetFieldIndex; + break; + } + m_semanticRoles[roleTargetFieldIndex].push_back(make_pair(limaTokenId,semanticRoleLabel)); + } + roleNumbers++; + } + } + } + } + } + return classIndex != 0; +} + + + +QStringList ConllHandler::splitSegment(const QString & segment, QRegExp separator) +{ + QStringList segmentsSplited; + segmentsSplited =segment.split(QRegExp(separator),QString::SkipEmptyParts); + return segmentsSplited; +} + +LinguisticGraphVertex ConllHandler::getLimaTokenId(int conllTokenId, int sentenceI, LimaConllTokenIdMapping* limaConllMapping) +{ + SEMANTICANALYSISLOGINIT; + std::map< int,std::map< int,LinguisticGraphVertex>>::iterator limaConllMappingIt; + limaConllMappingIt=limaConllMapping->find(sentenceI); + if (limaConllMappingIt == limaConllMapping->end()) + { + LERROR << "Sentence " << sentenceI << " not found"; + return 0; + } + std::map< int,LinguisticGraphVertex> limaConllId=(*limaConllMappingIt).second; + std::map< int,LinguisticGraphVertex>::iterator limaConllIdIt=limaConllId.find(conllTokenId); + if (limaConllIdIt==limaConllId.end()) + { + LERROR << "Conll token id " << conllTokenId << " not found"; + return 0; + } + LinguisticGraphVertex limaTokenId=limaConllIdIt->second; + return limaTokenId; +} 
+ +} +} +} // end namespace + + + + + + +#else + + +// version master + + + + +/* + Copyright 2002-2014 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. If not, see +*/ +/************************************************************************ + * + * @file SemanticRoleLabelingLoader.cpp + * @author Clémence Filmont + * @author Gael de Chalendar + * @date 2014 + * copyright Copyright (C) 2014-2016 by CEA LIST + ***********************************************************************/ + +#include "SemanticRoleLabelingLoader.h" +#include "LimaConllTokenIdMapping.h" +#ifdef ANTINNO_SPECIFIC +#include +#endif +#include "common/AbstractFactoryPattern/SimpleFactory.h" +#include "common/Data/strwstrtools.h" +#include "common/MediaticData/mediaticData.h" +#include "linguisticProcessing/LinguisticProcessingCommon.h" +#include "linguisticProcessing/common/annotationGraph/AnnotationData.h" +#include "linguisticProcessing/common/annotationGraph/AnnotationGraph.h" +#include "linguisticProcessing/core/Automaton/recognizerData.h" +#include "linguisticProcessing/core/LinguisticAnalysisStructure/LinguisticGraph.h" +#include "linguisticProcessing/core/LinguisticProcessors/LinguisticMetaData.h" +#include "linguisticProcessing/core/LinguisticAnalysisStructure/AnalysisGraph.h" + +#include +#include "QStringList" +#include +#include + +#include + +#include +#include +#include +#include + +using namespace std; +using 
namespace Lima::LinguisticProcessing::LinguisticAnalysisStructure; +using namespace Lima::LinguisticProcessing::ApplyRecognizer; +using namespace Lima::Common::XMLConfigurationFiles; +using namespace Lima::Common::AnnotationGraphs; +using namespace Lima::LinguisticProcessing::SemanticAnalysis; +using namespace Lima::Common::AnnotationGraphs; + + +namespace Lima { +namespace LinguisticProcessing { +namespace SemanticAnalysis { + +SimpleFactory SemanticRoleLabelingFactory(SEMANTICROLELABELINGLOADER_CLASSID); + +#define NBCOLSINSRLBEFOREFRAME 11 +#define CONLLTOKENSEPARATOR "\n+" +#define CONLLFIELDSEPARATOR "\t" + +// Conll handler +struct ConllHandler +{ + ConllHandler(MediaId language, AnalysisContent& analysis, LinguisticAnalysisStructure::AnalysisGraph* graph); + virtual ~ConllHandler(); + + /** + * @brief extract semantic annotations associated to token + * @param sentenceIndex the index of the current sentence + * @param limaConllMapping the chosen lima conll token id mapping + * @param sentence the current sentence + * @return true if any verbal class is found, false otherwise + */ + bool extractSemanticInformation(int sentenceIndex, LimaConllTokenIdMapping* limaConllMapping, const QString & sentence); + + /** + * @brief split a text into different types segments + * @param segment the segment to split + * @param separator the separator used to split + * @return the segment split + */ + QStringList splitSegment(const QString & segment, QRegExp separator); + + /** + * @brief get the lima token id matching any conll token one from the same text + * @param conllTokenId the conll token id one search the matched lima id + * @param sentenceNb the index of the current sentence + * @param limaConllMapping the chosen lima conll token id mapping + * @return the lima token id + * @note function to put in the LimaConllTokenIdMapping class? 
+ */ + LinguisticGraphVertex getLimaTokenId(int conllTokenId, int sentenceIndex, LimaConllTokenIdMapping* limaConllMapping); + + + MediaId m_language; + AnalysisContent& m_analysis; + LinguisticAnalysisStructure::AnalysisGraph* m_graph; + QRegExp m_descriptorSeparator; + QRegExp m_tokenSeparator; + QVector< QPair > m_verbalClasses; + QVector < std::vector> >m_semanticRoles; + int m_verbalClassNb; +}; + + +class SemanticRoleLabelingLoaderPrivate +{ + friend class SemanticRoleLabelingLoader; + SemanticRoleLabelingLoaderPrivate(); + ~SemanticRoleLabelingLoaderPrivate(); + + MediaId m_language; + std::string m_graph; + QString m_model; +}; + + + +//*********************************************************************** +SemanticRoleLabelingLoaderPrivate::SemanticRoleLabelingLoaderPrivate(): +m_language(0), +m_graph("PosGraph"), +m_model("VerbNet") +{} + +SemanticRoleLabelingLoaderPrivate::~SemanticRoleLabelingLoaderPrivate() +{ +} + +//*********************************************************************** +SemanticRoleLabelingLoader::SemanticRoleLabelingLoader(): + AnalysisLoader(), + m_d(new SemanticRoleLabelingLoaderPrivate()) +{ +} + +SemanticRoleLabelingLoader::~SemanticRoleLabelingLoader() +{ + delete m_d; +} + +//*********************************************************************** + +void SemanticRoleLabelingLoader::init(Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, Manager* manager) +{ + + SEMANTICANALYSISLOGINIT; + m_d->m_language=manager->getInitializationParameters().media; + AnalysisLoader::init(unitConfiguration,manager); + try + { + m_d->m_graph=unitConfiguration.getParamsValueAtKey("graph"); + } + catch (NoSuchParam& ) {} // keep default value + try + { + m_d->m_model = QString::fromUtf8(unitConfiguration.getParamsValueAtKey("model").c_str()); + } + catch (NoSuchParam& ) {} // keep default value +} + + +LimaStatusCode SemanticRoleLabelingLoader::process(AnalysisContent& analysis) const +{ + SEMANTICANALYSISLOGINIT; + 
AnalysisGraph* tokenList=static_cast(analysis.getData(m_d->m_graph)); + if (tokenList==0) + { + LERROR << "graph " << m_d->m_graph << " has not been produced: check pipeline" ; + return MISSING_DATA; + } + AnnotationData* annotationData = static_cast(analysis.getData("AnnotationData")); + LimaConllTokenIdMapping* limaConllMapping = static_cast(analysis.getData("LimaConllTokenIdMapping")); + + QString fileName = getInputFile(analysis); +======= + LinguisticMetaData* metadata=static_cast(analysis.getData("LinguisticMetaData")); + if (metadata == 0) + { + LERROR << "no LinguisticMetaData ! abort"; + return MISSING_DATA; + } +>>>>>>> .r8104 + + QString fileName = QString::fromUtf8((metadata->getMetaData("FileName")+m_inputFileExtension).c_str()); + + QFile file(fileName); + + + if (!file.open(QIODevice::ReadOnly)) + { + LERROR << "cannot open file" << fileName; + return CANNOT_OPEN_FILE_ERROR; + } + int sentenceNb=1; + std::map sentences; + while (!file.atEnd()) + { + QByteArray text=file.readLine(); + QString textString = QString::fromUtf8(text.constData()); + //One assume that the input file does not start with a blank line + if (textString.size()<3) + { + sentenceNb++; + } + else + { + QString becomingSentence=sentences[sentenceNb]+textString; + sentences[sentenceNb]= becomingSentence; + } + } + + ConllHandler cHandler(m_d->m_language, analysis, tokenList); + for (std::map::iterator it=sentences.begin(); it!=sentences.end(); ++it) + { + int sentenceIndex=it->first; + QString sentence=it->second; + if(cHandler.extractSemanticInformation(sentenceIndex, limaConllMapping, sentence)) + { +#ifdef DEBUG_LP + LDEBUG << "SemanticRoleLabelingLoader::process there is/are " << cHandler.m_verbalClassNb << "verbal class(es) for this sentence " ; +#endif + for (int vClassIndex=0;vClassIndexm_model + "." 
+ verbalClass; + } + LimaString verbalClass= verbalClasses.join("|"); + AnnotationGraphVertex annotPredicateVertex=annotationData->createAnnotationVertex(); + annotationData->addMatching("PosGraph", posGraphPredicateVertex, "annot", annotPredicateVertex); + annotationData->annotate(annotPredicateVertex, "Predicate", verbalClass); + + +#ifdef DEBUG_LP + LDEBUG << "SemanticRoleLabelingLoader::process: annotation vertex"<< annotPredicateVertex <<"was created for the verbal class "<< annotationData->stringAnnotation(annotPredicateVertex, "Predicate") << "and the PoS graph vertex"<>::iterator semRoleIt; + for (semRoleIt=cHandler.m_semanticRoles[vClassIndex].begin(); semRoleIt!=cHandler.m_semanticRoles[vClassIndex].end();semRoleIt++){ + LinguisticGraphVertex posGraphRoleVertex=(*semRoleIt).first; + + QStringList semanticRoles = (*semRoleIt).second.split("|"); + //for (QString& semanticRole: semanticRoles) + // Modif NAN compatibilité de compilation + for (QStringList::iterator it=semanticRoles.begin(); it!=semanticRoles.end(); ++it) + { + QString& semanticRole = *it; + if (!semanticRole.isEmpty()) + semanticRole = m_d->m_model + "." 
+ semanticRole; + } + LimaString semanticRole= semanticRoles.join("|"); + AnnotationGraphVertex annotRoleVertex=annotationData->createAnnotationVertex(); + AnnotationGraphEdge roleEdge=annotationData->createAnnotationEdge(annotPredicateVertex, annotRoleVertex); + annotationData->annotate(roleEdge, "SemanticRole", semanticRole); + annotationData->addMatching("PosGraph", posGraphRoleVertex, "annot", annotRoleVertex); + + +#ifdef DEBUG_LP + LDEBUG << "SemanticRoleLabelingLoader::process: annotation edge" << roleEdge << "annotated " << annotationData->stringAnnotation(roleEdge, "SemanticRole")<< "was created for" << verbalClass << " and the PoS graph vertices " << posGraphPredicateVertex << "and" << posGraphRoleVertex ; +#endif + } + } + } + } + return SUCCESS_ID; +} + + + +ConllHandler::ConllHandler(MediaId language, AnalysisContent& analysis, LinguisticAnalysisStructure::AnalysisGraph* graph): +m_language(language), +m_analysis(analysis), +m_graph(graph), +m_descriptorSeparator(CONLLFIELDSEPARATOR), +m_tokenSeparator(CONLLTOKENSEPARATOR), +m_verbalClasses(), +m_semanticRoles(), +m_verbalClassNb() +{ +} +ConllHandler::~ConllHandler() +{ +} +// designed to be repeated on each sentence +bool ConllHandler::extractSemanticInformation(int sentenceI, LimaConllTokenIdMapping* limaConllMapping, const QString & sent) +{ + SEMANTICANALYSISLOGINIT; + ConllHandler cHandler(m_language, m_analysis, m_graph); + QStringList sentenceTokens=cHandler.splitSegment(sent, m_tokenSeparator); + if (sentenceTokens.isEmpty()) + { + return false; + } + QString firstSentenceToken=(*sentenceTokens.constBegin()); + int descriptorsNb = cHandler.splitSegment(firstSentenceToken, m_descriptorSeparator).size(); + m_verbalClassNb = descriptorsNb - NBCOLSINSRLBEFOREFRAME - 1; + int classIndex=0; + if (m_verbalClassNb > 0) + { +#ifdef DEBUG_LP + LDEBUG << "ConllHandler::extractSemanticInformation" << m_verbalClassNb << sentenceI << " : \n" << sent ; +#endif + m_verbalClasses.clear(); + 
m_verbalClasses.resize(m_verbalClassNb); + m_semanticRoles.clear(); + m_semanticRoles.resize(m_verbalClassNb); + //repeated on each token of the sentence, that is on each line +#ifdef ANTINNO_SPECIFIC +BOOST_FOREACH (const auto & token, sentenceTokens) +#else + for (const auto & token: sentenceTokens) +#endif + { + int roleNumbers=0; + QStringList descriptors=cHandler.splitSegment(token,m_descriptorSeparator); + if (descriptors.size()>=NBCOLSINSRLBEFOREFRAME+m_verbalClassNb) + { + int conllTokenId=descriptors[0].toInt(); + QString conllToken=descriptors[1]; +#ifdef DEBUG_LP + LDEBUG << "ConllHandler::extractSemanticInformation token " << conllTokenId << conllToken; +#endif + if(descriptors[NBCOLSINSRLBEFOREFRAME]!="_") + { + QString verbalClass=descriptors[NBCOLSINSRLBEFOREFRAME]; + QString vClass=descriptors[NBCOLSINSRLBEFOREFRAME]; +#ifdef DEBUG_LP + LDEBUG << "ConllHandler::extractSemanticInformation verbalClass" << vClass; +#endif + LinguisticGraphVertex limaTokenId=cHandler.getLimaTokenId(conllTokenId, sentenceI, limaConllMapping); + if (classIndex >= m_verbalClasses.size()) + { + LERROR << "ConllHandler::extractSemanticInformation classIndex error" << classIndex; + break; + } + m_verbalClasses[classIndex]=qMakePair(limaTokenId, vClass); + classIndex++; + } +#ifdef ANTINNO_SPECIFIC + BOOST_FOREACH (auto roleTargetFieldIndex, boost::irange(0,m_verbalClassNb)) +#else for (auto roleTargetFieldIndex : boost::irange(0,m_verbalClassNb)) +#endif { #ifdef DEBUG_LP LDEBUG << "ConllHandler::extractSemanticInformation"<<"nb descriptors and roleTargetFieldIndex" << descriptors.size() << roleTargetFieldIndex ; @@ -373,7 +853,7 @@ bool ConllHandler::extractSemanticInformation(int sentenceI, LimaConllTokenIdMap } } } - return classIndex; + return classIndex != 0; } @@ -410,3 +890,9 @@ LinguisticGraphVertex ConllHandler::getLimaTokenId(int conllTokenId, int sentenc } } // end namespace + + + + + +#endif \ No newline at end of file diff --git 
a/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/NormalizeDateTime.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/NormalizeDateTime.cpp index 954a6eef7..0c3e34628 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/NormalizeDateTime.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/NormalizeDateTime.cpp @@ -264,12 +264,13 @@ updateCurrentDate(AnalysisContent& analysis, unsigned short NormalizeDate::getDayFromString(const LimaString& numdayString) const { SELOGINIT; - // try first conversion of type "premier" -> 1 - unsigned short day = m_resources->getCardinalFromNumberOrdinal(numdayString); + // try to extract number as int from string like 4th, 22nd, 1st or like 17 + unsigned short day = m_resources->getValueFromNumberOrdinal(numdayString); LDEBUG << "NormalizeDate::getDayFromString: testConversion 1 of " << numdayString << "1 day=" << day; + // try first conversion of type "premier" -> 1 // then try conversion of type "10th" -> 10 if( day == NormalizeDateTimeResources::no_day ) { - day = m_resources->getDayNumberFromWordOrdinal(numdayString); + day = m_resources->getValueFromWordCardinalOrOrdinal(numdayString); LDEBUG << "NormalizeDate::getDayFromString: testConversion 2 of " << numdayString << "1 day=" << day; } // then try conversion of type "10" -> 10 @@ -450,13 +451,23 @@ operator()(RecognizerMatch& m, // set interval QDate firstDayOfMonth(year,month,1); #ifdef DEBUG_LP - LDEBUG << "NormalizeDate operator(): day=0 and month != 0 => date_begin=" << firstDayOfMonth; +#ifdef ANTINNO_SPECIFIC + // FWI 21/09/2015 modifi temporairement + LDEBUG << "NormalizeDate operator(): day=0 and month != 0 => date_begin=" << "????"; +#else + LDEBUG << "NormalizeDate operator(): day=0 and month != 0 => date_begin=" << firstDayOfMonth; +#endif #endif m.features().setFeature(DATE_BEGIN_FEATURE_NAME,firstDayOfMonth); if (month_end==0) { QDate date_end = 
firstDayOfMonth.addMonths(1).addDays(-1); #ifdef DEBUG_LP - LDEBUG << "NormalizeDate operator(): day=0 and month != 0 => date_end=" << date_end; +#ifdef ANTINNO_SPECIFIC + // FWI 21/09/2015 modifi temporairement + LDEBUG << "NormalizeDate operator(): day=0 and month != 0 => date_end=" << "????"; +#else + LDEBUG << "NormalizeDate operator(): day=0 and month != 0 => date_end=" << date_end; +#endif #endif m.features().setFeature(DATE_END_FEATURE_NAME,date_end); } @@ -533,7 +544,9 @@ operator()(RecognizerMatch& m, m.features().setFeature(DATESTRING_FEATURE_NAME,m.getString()); } - QString dateSpan = QString::number(year); + QString dateSpan = "XXXX"; + if( year != 0 ) + dateSpan = QString::number(year); #ifdef DEBUG_LP LDEBUG << "NormalizeDate operator(): year: dateSpan=" << dateSpan; #endif diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/NormalizeDateTimeResources.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/NormalizeDateTimeResources.cpp index c4582f499..f53d747af 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/NormalizeDateTimeResources.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/NormalizeDateTimeResources.cpp @@ -29,6 +29,7 @@ #include "linguisticProcessing/client/LinguisticProcessingException.h" #include "common/AbstractFactoryPattern/SimpleFactory.h" #include "common/MediaticData/mediaticData.h" +#include "common/tools/FileUtils.h" #include "common/Data/strwstrtools.h" #include "boost/algorithm/string/split.hpp" #include "boost/algorithm/string/classification.hpp" @@ -56,9 +57,13 @@ MONTHSDAYS_MONTH_ID=std::string("m"); const std::string NormalizeDateTimeResources:: MONTHSDAYS_DAY_ID=std::string("d"); const std::string NormalizeDateTimeResources:: -MONTHSDAYS_ORDINAL_ID=std::string("o"); +WORD_CARDINAL_ID=std::string("c"); const std::string NormalizeDateTimeResources:: -MONTHSDAYS_SUFFIX_ID=std::string("s"); 
+WORD_CARDINAL_SEPARATOR_ID=std::string("s"); +const std::string NormalizeDateTimeResources:: +WORD_ORDINAL_SUFFIX_ID=std::string("w"); +const std::string NormalizeDateTimeResources:: +NUMBER_ORDINAL_SUFFIX_ID=std::string("n"); NormalizeDateTimeResources::NormalizeDateTimeResources(): @@ -89,7 +94,7 @@ init(GroupConfigurationStructure& unitConfiguration, try { tzDbFile = unitConfiguration.getParamsValueAtKey("timezoneDatabase"); - tzDbFile = resourcesPath + "/" + tzDbFile; + tzDbFile = Common::Misc::findFileInPaths(resourcesPath.c_str(), tzDbFile.c_str()).toUtf8().constData(); // m_timezoneDatabase = new boost::local_time::tz_database(); // m_timezoneDatabase->load_from_file(tzDbFile); } @@ -112,7 +117,7 @@ init(GroupConfigurationStructure& unitConfiguration, try { string monthsDaysFile = unitConfiguration.getParamsValueAtKey("monthsDays"); - monthsDaysFile = resourcesPath + "/" + monthsDaysFile; + monthsDaysFile = Common::Misc::findFileInPaths(resourcesPath.c_str(), monthsDaysFile.c_str()).toUtf8().constData(); if (!readMonthDays(monthsDaysFile)) { SELOGINIT; LERROR << "Error loading monthsDays resources '" @@ -132,6 +137,7 @@ bool NormalizeDateTimeResources:: readMonthDays(const std::string& monthsDaysFile) { + m_wordCardinalSeparator[Common::Misc::utf8stdstring2limastring(" ")]=0; ifstream file(monthsDaysFile.c_str(), std::ifstream::binary); if (!file.good()) { return false; @@ -139,13 +145,13 @@ readMonthDays(const std::string& monthsDaysFile) string utf8line; LimaString line; while (file.good()) { - getline(file,utf8line); + utf8line = Lima::Common::Misc::readLine(file); if (!utf8line.empty()) { line=Common::Misc::utf8stdstring2limastring(utf8line); std::vector elements; split(elements,utf8line,is_any_of(MONTHSDAYS_MAIN_SEP)); - // three elements in line: (month|day|ordinal|suffix) num list,of,strings - if (elements.size()!=3) { + // three elements in line: (month|day|ordinal|cardinal|suffix) num list-of-strings + if (elements.size()!=3) { SELOGINIT; LWARN << 
"MonthsDaysResources: cannot parse line " << utf8line; continue; @@ -153,12 +159,14 @@ readMonthDays(const std::string& monthsDaysFile) map* names(0); if (elements[0] == MONTHSDAYS_MONTH_ID) { names=&m_months; } else if (elements[0] == MONTHSDAYS_DAY_ID) { names=&m_days; } - else if (elements[0] == MONTHSDAYS_ORDINAL_ID) { names=&m_ordinal; } - else if (elements[0] == MONTHSDAYS_SUFFIX_ID) { names=&m_ordinalSuffixes; } + else if (elements[0] == WORD_CARDINAL_SEPARATOR_ID) { names=&m_wordCardinalSeparator; } + else if (elements[0] == WORD_CARDINAL_ID) { names=&m_wordCardinal; } + else if (elements[0] == WORD_ORDINAL_SUFFIX_ID) { names=&m_wordOrdinalSuffixes; } + else if (elements[0] == NUMBER_ORDINAL_SUFFIX_ID) { names=&m_numberOrdinalSuffixes; } else { SELOGINIT; LWARN << "MonthsDaysResources: cannot parse line " << utf8line - << ": first element must be 'm' 'd', 'o' or 's'"; + << ": first element must be 'm' 'd', 'c', 'w', 'n' or 's'"; continue; } @@ -208,29 +216,87 @@ getDayNumber(const LimaString& dayName) const } unsigned short NormalizeDateTimeResources:: -getDayNumberFromWordOrdinal(const LimaString& dayName) const +getValueFromWordCardinalOrOrdinal(const LimaString& dayName) const { - map::const_iterator - it=m_ordinal.find(dayName); - if (it==m_ordinal.end()) { - return NormalizeDateTimeResources::no_day; + SELOGINIT; + unsigned short day(0); + // trim suffix first, second or th, or (me, ime, ieme, eme) + LimaString numberAsString(dayName); + LDEBUG << "NormalizeDateTimeResources::getValueFromWordCardinalOrOrdinal() numberAsString=" + << numberAsString; + map::const_iterator suffixIt=m_wordOrdinalSuffixes.begin(); + for( ; suffixIt!=m_wordOrdinalSuffixes.end() ; suffixIt++ ) + { + const LimaString& suffix = (*suffixIt).first; + int index = dayName.indexOf(suffix, 0, Qt::CaseInsensitive); + if (index >= 0) { + numberAsString = LimaString(dayName.constData(),index); + day += (*suffixIt).second; + break; + } } - return (*it).second; + LDEBUG << 
"NormalizeDateTimeResources::getValueFromWordCardinalOrOrdinal: after trim numberAsString=" + << numberAsString << ", day=" << day; + if( numberAsString.isEmpty() ) + return day; + // compute value from left to right + int parsingPosition(0); + LDEBUG << "NormalizeDateTimeResources::getValueFromWordCardinalOrOrdinal: parsingPosition=" << parsingPosition; + for( ; ; ) + { + int index(-1); + // identify component of number + map::const_iterator cardinalIt=m_wordCardinal.begin(); + for( ; cardinalIt!=m_wordCardinal.end() ; cardinalIt++ ) + { + const LimaString& word = (*cardinalIt).first; + int index = numberAsString.indexOf(word, parsingPosition, Qt::CaseInsensitive); + if (index >= 0) { + day += (*cardinalIt).second; + parsingPosition += word.length(); + LDEBUG << "NormalizeDateTimeResources::getValueFromWordCardinalOrOrdinal: found" + << word << ", day=" << day << ", parsingPosition=" << parsingPosition; + break; + } + } + // skip separator + int skipIndex(-1); + do + { + map::const_iterator separatorIt=m_wordCardinalSeparator.begin(); + for( ; separatorIt!=m_wordCardinalSeparator.end() ; separatorIt++ ) + { + const LimaString& separator = (*separatorIt).first; + int skipIndex = numberAsString.indexOf(separator, parsingPosition, Qt::CaseInsensitive); + if (skipIndex == 0) { + parsingPosition += separator.length(); + LDEBUG << "NormalizeDateTimeResources::getValueFromWordCardinalOrOrdinal: found" + << separator << ", day=" << day << ", parsingPosition=" << parsingPosition; + break; + } + } + } while( skipIndex == 0 ); + if( index == -1 ) + break; + } + return day; } unsigned short NormalizeDateTimeResources:: -getCardinalFromNumberOrdinal(const LimaString& dayName) const +getValueFromNumberOrdinal(const LimaString& dayName) const { - // try to extract number as int from string - map::const_iterator it=m_ordinalSuffixes.begin(); - for( ; it!=m_ordinalSuffixes.end() ; it++ ) + // try to extract number as int from string like 4th, 22nd, 1st or 17 + map::const_iterator 
it=m_numberOrdinalSuffixes.begin(); + for( ; it!=m_numberOrdinalSuffixes.end() ; it++ ) { + // try to trim suffix th, nd, st or rd const LimaString& suffix = (*it).first; int index = dayName.indexOf(suffix, 0, Qt::CaseInsensitive); - if (index < 0) - continue; - LimaString numberAsString(dayName.constData(),index); + LimaString numberAsString(dayName); + if (index > 0) + numberAsString = LimaString(dayName.constData(),index); bool ok(false); + // try to convert trimmed string to int unsigned short day = numberAsString.toUShort(&ok); if( ok) return day; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/NormalizeDateTimeResources.h b/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/NormalizeDateTimeResources.h index bd66c209e..6ffd9bb6a 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/NormalizeDateTimeResources.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/NormalizeDateTimeResources.h @@ -59,8 +59,8 @@ class LIMA_SPECIFICENTITIES_EXPORT NormalizeDateTimeResources : public AbstractR // const boost::local_time::tz_database& getTimezoneDatabase() const; unsigned short getMonthNumber(const LimaString& monthName) const; unsigned short getDayNumber(const LimaString& dayName) const; - unsigned short getCardinalFromNumberOrdinal(const LimaString& dayName) const; - unsigned short getDayNumberFromWordOrdinal(const LimaString& dayName) const; + unsigned short getValueFromWordCardinalOrOrdinal(const LimaString& dayName) const; + unsigned short getValueFromNumberOrdinal(const LimaString& dayName) const; static const unsigned short no_month=static_cast(-1); static const unsigned short no_day=static_cast(-1); @@ -70,8 +70,10 @@ class LIMA_SPECIFICENTITIES_EXPORT NormalizeDateTimeResources : public AbstractR // boost::local_time::tz_database* m_timezoneDatabase; std::map m_months; std::map m_days; - std::map m_ordinal; - std::map m_ordinalSuffixes; + 
std::map m_wordCardinal; + std::map m_wordCardinalSeparator; + std::map m_wordOrdinalSuffixes; + std::map m_numberOrdinalSuffixes; // private member functions bool readMonthDays(const std::string& monthsDaysFile); @@ -81,8 +83,10 @@ class LIMA_SPECIFICENTITIES_EXPORT NormalizeDateTimeResources : public AbstractR static const std::string MONTHSDAYS_NAMELIST_SEP; static const std::string MONTHSDAYS_MONTH_ID; static const std::string MONTHSDAYS_DAY_ID; - static const std::string MONTHSDAYS_ORDINAL_ID; - static const std::string MONTHSDAYS_SUFFIX_ID; + static const std::string WORD_CARDINAL_ID; + static const std::string WORD_CARDINAL_SEPARATOR_ID; + static const std::string WORD_ORDINAL_SUFFIX_ID; + static const std::string NUMBER_ORDINAL_SUFFIX_ID; }; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesConstraints.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesConstraints.cpp index d3046e4f5..29ee9f736 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesConstraints.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesConstraints.cpp @@ -55,11 +55,11 @@ namespace SpecificEntities { // factories for constraint functions defined in this file -ConstraintFunctionFactory -isASpecificEntityFactory(isASpecificEntityId); +ConstraintFunctionFactory + isAlphaPossessiveFactory(isAlphaPossessiveId); -ConstraintFunctionFactory - isInSameSpecificEntityFactory(isInSameSpecificEntityId); +ConstraintFunctionFactory + isASpecificEntityFactory(isASpecificEntityId); ConstraintFunctionFactory CreateSpecificEntityFactory(CreateSpecificEntityId); @@ -67,6 +67,9 @@ ConstraintFunctionFactory ConstraintFunctionFactory SetEntityFeatureFactory(SetEntityFeatureId); +ConstraintFunctionFactory + AddEntityFeatureAsEntityFactory(AddEntityFeatureAsEntityId); + ConstraintFunctionFactory 
AddEntityFeatureFactory(AddEntityFeatureId); @@ -80,6 +83,25 @@ ConstraintFunctionFactory NormalizeEntityFactory(NormalizeEntityId); +isAlphaPossessive:: +isAlphaPossessive(MediaId language, + const LimaString& complement): +ConstraintFunction(language,complement) +{ +} + +bool isAlphaPossessive::operator()(const LinguisticAnalysisStructure::AnalysisGraph& graph, + const LinguisticGraphVertex& v, + AnalysisContent& /*analysis*/) const +{ + LinguisticGraph* lingGraph = const_cast(graph.getGraph()); +// Token* token=get(vertex_token,*(graph.getGraph()),v); + VertexTokenPropertyMap tokenMap = get(vertex_token, *lingGraph); + const TStatus& status = tokenMap[v]->status(); + return( status.isAlphaPossessive() ); +} + + isASpecificEntity:: isASpecificEntity(MediaId language, const LimaString& complement): @@ -142,103 +164,6 @@ bool isASpecificEntity::operator()(const LinguisticAnalysisStructure::AnalysisGr return false; } -isInSameSpecificEntity:: - isInSameSpecificEntity(MediaId language, - const LimaString& complement): - ConstraintFunction(language,complement), - m_type() -{ - if (! complement.isEmpty()) { - m_type=Common::MediaticData::MediaticData::single().getEntityType(complement); - } -} - -/** @brief Tests if the two given vertices are in the same specific entity - * - * There is several cases: - * - va1 and va2 are SE vertices : true iff va1 == va2 - * - va1 and va2 are standard vertices : true iff there is an outgoing edge - * in the annotation graph annotated with "belongstose" from each of them - * and toward the same vertex - * - va1 (va2) is a SE vertex and there is an outgoing edge in the annotation - * graph annotated with "belongstose" from va2 (va1) to va1 (va2). 
- * - * In all the cases, va1 and va2 are the uniq "morphannot" matches of v1 and v2 - * - * @note This method handles only the first level of SE: if a SE is recursively - * included in a second one, morph vertices from the first one and from the - * the second one (not in the first one) will NOT be considered as being in the - * same specific entity. - * @note It is considered that a morph vertex can be directly in only one SE. - * So, its annotation vertex will have at most one "belongstose" annotated - * outgoing edge. - */ -bool isInSameSpecificEntity::operator()( - const LinguisticAnalysisStructure::AnalysisGraph& /*graph*/, - const LinguisticGraphVertex& v1, - const LinguisticGraphVertex& v2, - AnalysisContent& analysis) const -{ - RecognizerData* recoData=static_cast(analysis.getData("RecognizerData")); - AnnotationData* annotationData = static_cast< AnnotationData* >(analysis.getData("AnnotationData")); - AnnotationGraphVertex va1 = *(annotationData->matches(recoData->getGraphId(), v1, "annot").begin()); - AnnotationGraphVertex va2 = *(annotationData->matches(recoData->getGraphId(), v2, "annot").begin()); - - if ( (va1 == va2) && annotationData->hasAnnotation(va1, Common::Misc::utf8stdstring2limastring("SpecificEntity")) ) - { // first case - return true; - } - AnnotationGraphVertex vase = std::numeric_limits::max(); - AnnotationGraphVertex va = std::numeric_limits::max(); - if (annotationData->hasAnnotation(va1, Common::Misc::utf8stdstring2limastring("SpecificEntity"))) - { - vase = va1; - va = va2; - } - else if (annotationData->hasAnnotation(va2, Common::Misc::utf8stdstring2limastring("SpecificEntity"))) - { - vase = va2; - va = va1; - } - if (vase == std::numeric_limits::max()) - { // second case - AnnotationGraphOutEdgeIt it1, it1_end; - AnnotationGraphVertex se1 = std::numeric_limits::max(); - boost::tie(it1, it1_end) = out_edges(va1, annotationData->getGraph()); - for (; it1 != it1_end; it1++) - { - if ( annotationData->intAnnotation((*it1), 
Common::Misc::utf8stdstring2limastring("belongstose"))==1) - { - se1 = target(*it1, annotationData->getGraph()); - break; - } - } - if (se1 == std::numeric_limits::max()) - { - return false; - } - AnnotationGraphVertex se2 = std::numeric_limits::max(); - AnnotationGraphOutEdgeIt it2, it2_end; - boost::tie(it2, it2_end) = out_edges(va2, annotationData->getGraph()); - for (; it2 != it2_end; it2++) - { - if ( annotationData->intAnnotation((*it2), Common::Misc::utf8stdstring2limastring("belongstose"))==1) - { - se2 = target(*it2, annotationData->getGraph()); - break; - } - } - return (se1 == se2); - } - else - { // third case - bool ok; AnnotationGraphEdge e; - boost::tie(e, ok) = edge(va,vase,annotationData->getGraph()); - return (ok && (annotationData->intAnnotation(e, Common::Misc::utf8stdstring2limastring("belongstose"))==1)); - } -} - - CreateSpecificEntity::CreateSpecificEntity(MediaId language, const LimaString& complement): @@ -423,7 +348,18 @@ bool CreateSpecificEntity::operator()(Automaton::RecognizerMatch& match, annotationData->dumpFunction("SpecificEntity", new DumpSpecificEntityAnnotation()); } - RecognizerData* recoData=static_cast(analysis.getData("RecognizerData")); + AnalysisData* rdata=analysis.getData("RecognizerData"); + if (rdata==0) { + SELOGINIT; + LERROR << "CreateSpecificEntity: missing data RecognizerData: entity will not be created"; + return false; + } + RecognizerData* recoData=static_cast(rdata); + if (recoData==0) { + SELOGINIT; + LERROR << "CreateSpecificEntity: missing data RecognizerData: entity will not be created"; + return false; + } std::string graphId=recoData->getGraphId(); // LDEBUG << " match is " << match; @@ -443,7 +379,14 @@ bool CreateSpecificEntity::operator()(Automaton::RecognizerMatch& match, VertexTokenPropertyMap tokenMap = get(vertex_token, *lingGraph); VertexDataPropertyMap dataMap = get(vertex_data, *lingGraph); - const MorphoSyntacticData* dataHead = dataMap[annot.getHead()]; + LinguisticGraphVertex head = 
annot.getHead(); + if( head == 0 ) { + // take status of last element in match for eng + head = v2; + // or take status of first element in match (in fre?) + // head = v1; + } + const MorphoSyntacticData* dataHead = dataMap[head]; // Preparer le Token et le MorphoSyntacticData pour le nouveau noeud. Construits // a partir des infos de l'entitee nommee @@ -475,7 +418,7 @@ bool CreateSpecificEntity::operator()(Automaton::RecognizerMatch& match, LDEBUG << "CreateSpecificEntity, use micros from config file "; #endif // use micros given in the config file : get the specific resource - // (specific to modex) + // (specific to modex) AddEntityFeature // WARN : some hard coded stuff here in resource names EntityType seType=match.getType(); if (seType.getGroupId() == 0) @@ -522,11 +465,13 @@ bool CreateSpecificEntity::operator()(Automaton::RecognizerMatch& match, match.positionBegin(), match.length()); - // always take status from first element in match - //if (match.size() == 1) - //{ - newToken->setStatus(tokenMap[v1]->status()); - //} + // take posessive tstatus from head + TStatus tStatus = tokenMap[head]->status(); + const TStatus& headTStatus = tokenMap[v2]->status(); + if(headTStatus.isAlphaPossessive()) { + tStatus.setAlphaPossessive(true); + } + newToken->setStatus(tStatus); if (newMorphData->empty()) { @@ -577,8 +522,14 @@ bool CreateSpecificEntity::operator()(Automaton::RecognizerMatch& match, } else { - AnnotationGraphVertex src = *(matches.begin()); - annotationData->annotate( src, agv, Common::Misc::utf8stdstring2limastring("belongstose"), 1); + if( recoData->hasVertexAsEmbededEntity((*matchIt).m_elem.first) ) + { +#ifdef DEBUG_LP + LDEBUG << "CreateSpecificEntity::operator(): vertex " << *(matches.begin()) << " is embeded"; +#endif + AnnotationGraphVertex src = *(matches.begin()); + annotationData->annotate( agv, src, Common::Misc::utf8stdstring2limastring("holds"), 1); + } } } @@ -902,9 +853,11 @@ operator()(const LinguisticAnalysisStructure::AnalysisGraph& 
graph, const LinguisticGraphVertex& vertex, AnalysisContent& analysis) const { +#ifdef DEBUG_LP SELOGINIT; LDEBUG << "SetEntityFeature:: (one argument) start... "; LDEBUG << "SetEntityFeature::(feature:" << m_featureName << ", vertex:" << vertex << ")"; +#endif // get RecognizerData: the data in which the features are stored RecognizerData* recoData=static_cast(analysis.getData("RecognizerData")); if (recoData==0) { @@ -924,7 +877,9 @@ operator()(const LinguisticAnalysisStructure::AnalysisGraph& graph, } switch (m_featureType) { case QVariant::String: +#ifdef DEBUG_LP LDEBUG << "SetEntityFeature:: recoData->setEntityFeature(feature:" << m_featureName << ", featureValue:" << featureValue<< ")"; +#endif recoData->setEntityFeature(m_featureName,featureValue); break; case QVariant::Int: @@ -957,15 +912,18 @@ operator()(const LinguisticAnalysisStructure::AnalysisGraph& graph, const LinguisticGraphVertex& v2, AnalysisContent& analysis) const { +#ifdef DEBUG_LP SELOGINIT; // LERROR << "SetEntityFeature:: Error: version with two vertices parameters is not implemented"; // return false; LDEBUG << "SetEntityFeature:: (two arguments) start... "; LDEBUG << "SetEntityFeature::(feature:" << m_featureName << ", v1:" << v1 << ", v2:" << v2 << ")"; +#endif // get RecognizerData: the data in which the features are stored RecognizerData* recoData=static_cast(analysis.getData("RecognizerData")); if (recoData==0) { + SELOGINIT; LERROR << "SetEntityFeature:: Error: missing RecognizerData"; return false; } @@ -1000,7 +958,8 @@ operator()(const LinguisticAnalysisStructure::AnalysisGraph& graph, } } } - if( nbEdges > 1 ) { + if( nbEdges > 1 ) { + SELOGINIT; LWARN << "SetEntityFeature:: Warning: ambiguïties in graph"; } @@ -1041,6 +1000,54 @@ operator()(const LinguisticAnalysisStructure::AnalysisGraph& graph, return true; } +//---------------------------------------------------------------------------------------- +// AddEntityFeatureAsEntity : assert the the vertex is a named entity. 
+// Add it to the list of components as an embeded entity (the list is used to create the link +// "holds" between the annotation of the embeded and the embedding entity. +// Remember the embedding entity is no yet created. + +AddEntityFeatureAsEntity::AddEntityFeatureAsEntity(MediaId language, + const LimaString& complement): +ConstraintFunction(language,complement), +m_featureName(""), +m_featureType(QVariant::UserType) +{ + if (complement.size()) { + QStringList complementElements = complement.split(":"); + m_featureName=complementElements.front().toUtf8().constData(); + complementElements.pop_front(); + if (!complementElements.empty()) { +#ifdef DEBUG_LP + SELOGINIT; + LERROR << "AddEntityFeatureAsEntity::AddEntityFeatureAsEntity(): no type specification authorized for the feature (" + << complementElements << ") the feature type is the type of the entity"; +#endif + } + } +} + +bool AddEntityFeatureAsEntity:: +operator()(const LinguisticAnalysisStructure::AnalysisGraph& /* unused graph */, + const LinguisticGraphVertex& vertex, + AnalysisContent& analysis) const +{ +#ifdef DEBUG_LP + SELOGINIT; + LDEBUG << "AddEntityFeatureAsEntity:: (one argument) start... 
"; + LDEBUG << "AddEntityFeatureAsEntity::(feature:" << m_featureName << ", vertex:" << vertex << ")"; +#endif + // get RecognizerData: the data in which the features are stored + RecognizerData* recoData=static_cast(analysis.getData("RecognizerData")); + if (recoData==0) { + SELOGINIT; + LERROR << "AddEntityFeatureAsEntity:: Error: missing RecognizerData"; + return false; + } + // add the vertex to the list of embeded named entities + recoData->addVertexAsEmbededEntity(vertex); + return true; +} + //---------------------------------------------------------------------------------------- // AddEntityFeature : add a value for a given feature to the recognized entity // we do not have direct access to the RecognizerMatch of the entity when calling this function @@ -1420,8 +1427,20 @@ SELOGINIT; LERROR << "NormalizeEntity:: Error: missing RecognizerData"; return false; } -// assign stored features to RecognizerMatch features -match.features()=recoData->getEntityFeatures(); +// assign stored features to RecognizerMatch features (preserving DEFAULT_ATTIBUTE) +//match.features()=recoData->getEntityFeatures(); +#ifdef ANTINNO_SPECIFIC +Q_FOREACH (const auto& f, recoData->getEntityFeatures()) { +#else +for (const auto& f: recoData->getEntityFeatures()) { +#endif + match.features().addFeature(f.getName(),f.getValue()); + EntityFeatures::iterator featureIt = match.features().findLast(f.getName()); + if( f.getPosition() != UNDEFPOSITION ) { + (*featureIt).setPosition(f.getPosition()); + (*featureIt).setLength(f.getLength()); + } +} // must clear the stored features, once they are used (otherwise, will be kept for next entity) recoData->clearEntityFeatures(); return true; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesConstraints.h b/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesConstraints.h index 31e8c26a0..a4a4dec8a 100644 --- 
a/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesConstraints.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesConstraints.h @@ -33,10 +33,11 @@ namespace LinguisticProcessing { namespace SpecificEntities { // ids for constraints in this file +#define isAlphaPossessiveId "isAlphaPossessive" #define isASpecificEntityId "isASpecificEntity" -#define isInSameSpecificEntityId "isInSameSpecificEntity" #define CreateSpecificEntityId "CreateSpecificEntity" #define SetEntityFeatureId "SetEntityFeature" +#define AddEntityFeatureAsEntityId "AddEntityFeatureAsEntity" #define AddEntityFeatureId "AddEntityFeature" #define AppendEntityFeatureId "AppendEntityFeature" #define ClearEntityFeaturesId "ClearEntityFeatures" @@ -45,44 +46,27 @@ namespace SpecificEntities { /** @author Benoit Mathieu */ -class LIMA_SPECIFICENTITIES_EXPORT isASpecificEntity : public Automaton::ConstraintFunction +class LIMA_SPECIFICENTITIES_EXPORT isAlphaPossessive : public Automaton::ConstraintFunction { public: - isASpecificEntity(MediaId language, + isAlphaPossessive(MediaId language, const LimaString& complement=LimaString()); - ~isASpecificEntity() {} + ~isAlphaPossessive() {} bool operator()(const LinguisticAnalysisStructure::AnalysisGraph& graph, const LinguisticGraphVertex& v, AnalysisContent& analysis) const; - -private: - Common::MediaticData::EntityType m_type; }; -class LIMA_SPECIFICENTITIES_EXPORT isInSameSpecificEntity : public Automaton::ConstraintFunction +class LIMA_SPECIFICENTITIES_EXPORT isASpecificEntity : public Automaton::ConstraintFunction { public: - isInSameSpecificEntity(MediaId language, - const LimaString& complement=LimaString()); - ~isInSameSpecificEntity() {} - - /** @brief Tests if the two given vertices are in the same specific entity - * - * There is several cases: - * - va1 and va2 are SE vertices : true iff va1 == va2 - * - va1 and va2 are standard vertices : true iff there is an 
outgoing edge in - * the annotation graph annotated with "belongstose" from each of them and - * toward the same vertex - * - va1 (va2) is a SE vertex and there is an outgoing edge in the annotation - * graph annotated with "belongstose" from va2 (va1) to va1 (va2). - * - * In all the cases, va1 and va2 are the uniq "morphannot" matches of v1 and v2 - */ + isASpecificEntity(MediaId language, + const LimaString& complement=LimaString()); + ~isASpecificEntity() {} bool operator()(const LinguisticAnalysisStructure::AnalysisGraph& graph, - const LinguisticGraphVertex& v1, - const LinguisticGraphVertex& v2, + const LinguisticGraphVertex& v, AnalysisContent& analysis) const; - + private: Common::MediaticData::EntityType m_type; }; @@ -166,6 +150,27 @@ class LIMA_SPECIFICENTITIES_EXPORT SetEntityFeature : public Automaton::Constrai QVariant::Type m_featureType; }; +/** + * @brief This action add a vertex as an embeded entity + * of the entity (i.e. during the rule matching process). + * + */ +class LIMA_SPECIFICENTITIES_EXPORT AddEntityFeatureAsEntity : public Automaton::ConstraintFunction +{ +public: + AddEntityFeatureAsEntity(MediaId language, + const LimaString& complement=LimaString()); + ~AddEntityFeatureAsEntity() {} + bool operator()(const LinguisticAnalysisStructure::AnalysisGraph& graph, + const LinguisticGraphVertex& vertex, + AnalysisContent& analysis) const; + +private: + std::string m_featureName; + Common::MediaticData::EntityType m_type; + QVariant::Type m_featureType; +}; + /** * @brief This action set the value of a feature for an entity during the recognition * of the entity (i.e. during the rule matching process). 
diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesLoader.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesLoader.cpp index 29bd807d5..ee00ad454 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesLoader.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesLoader.cpp @@ -28,6 +28,7 @@ #include "SpecificEntitiesLoader.h" #include "SpecificEntitiesConstraints.h" #include "common/AbstractFactoryPattern/SimpleFactory.h" +#include "common/tools/FileUtils.h" #include "common/Data/strwstrtools.h" #include "linguisticProcessing/core/Automaton/recognizerMatch.h" #include "linguisticProcessing/core/Automaton/recognizerData.h" @@ -77,8 +78,8 @@ init(Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfigurati deque modex=unitConfiguration.getListsValueAtKey("modex"); for (deque::const_iterator it=modex.begin(),it_end=modex.end();it!=it_end;it++) { LDEBUG << "loader: initialize modex " << *it; - string filename=Common::MediaticData::MediaticData::single().getConfigPath()+"/"+*it; - Common::XMLConfigurationFiles::XMLConfigurationFileParser parser(filename); + QString filename = Common::Misc::findFileInPaths(Common::MediaticData::MediaticData::single().getConfigPath().c_str(),(*it).c_str()); + Common::XMLConfigurationFiles::XMLConfigurationFileParser parser(filename.toUtf8().constData()); Common::MediaticData::MediaticData::changeable().initEntityTypes(parser); } } @@ -114,7 +115,11 @@ process(AnalysisContent& analysis) const SpecificEntitiesLoader::XMLHandler handler(m_language,analysis,graph); m_parser->setContentHandler(&handler); m_parser->setErrorHandler(&handler); +#ifdef ANTINNO_SPECIFIC QFile file(getInputFile(analysis).c_str()); +#else + QFile file(getInputFile(analysis)); +#endif if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) throw XMLException(); if 
(!m_parser->parse( QXmlInputSource(&file))) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesMicros.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesMicros.cpp index 6adb06b10..903013dc8 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesMicros.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesMicros.cpp @@ -64,23 +64,23 @@ init(GroupConfigurationStructure& unitConfiguration, MediaId language=manager->getInitializationParameters().language; const PropertyManager& microManager = static_cast(MediaticData::single().mediaData(language)).getPropertyCodeManager().getPropertyManager("MICRO"); - const map >& entities= - unitConfiguration.getLists(); + const map >& entities = unitConfiguration.getLists(); + #ifdef DEBUG_LP + LDEBUG << "entities.size() " << entities.size(); + #endif - for (map >::const_iterator it=entities.begin(), - it_end=entities.end(); it!=it_end; it++) { + for (auto it=entities.begin(), it_end=entities.end(); it!=it_end; it++) { LimaString entityName=Common::Misc::utf8stdstring2limastring((*it).first); #ifdef DEBUG_LP LDEBUG << "Adding categories to entity " << entityName; #endif try { EntityType type=static_cast(MediaticData::single()).getEntityType(entityName); - for (deque::const_iterator micro=(*it).second.begin(), - micro_end=(*it).second.end(); micro!=micro_end; micro++) { + for (auto micro=(*it).second.begin(), micro_end=(*it).second.end(); micro!=micro_end; micro++) { LinguisticCode code = microManager.getPropertyValue(*micro); if (code == 0) { SELOGINIT; - LERROR << "SpecificEntitiesMicros::init on entity" << entityName << "," << *micro << "linguistic code is not defined"; + LERROR << "SpecificEntitiesMicros::init on entity" << entityName << "," << *micro << "linguistic code is not defined for language" << 
MediaticData::single().getMediaId(language); } else { #ifdef DEBUG_LP diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesRecognizer.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesRecognizer.cpp index aebad4304..19901421e 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesRecognizer.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesRecognizer.cpp @@ -183,7 +183,6 @@ LimaStatusCode SpecificEntitiesRecognizer::process( LinguisticGraph* graph=anagraph->getGraph(); std::queue toVisit; VertexTokenPropertyMap tokenMap=get(vertex_token,*graph); - VertexDataPropertyMap DataMap=get(vertex_data,*graph); std::set visited; try diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesXmlLogger.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesXmlLogger.cpp index 36ed408d0..2bbbd1162 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesXmlLogger.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesXmlLogger.cpp @@ -215,7 +215,7 @@ LimaStatusCode SpecificEntitiesXmlLogger::process( } const SpecificEntityAnnotation* annot=getSpecificEntityAnnotation(v,annotationData); if (annot != 0) { - outputEntity(out,v,annot,tokenMap,offset); + outputEntity(annotationData,out,v,annot,tokenMap,offset); } } } @@ -251,7 +251,7 @@ LimaStatusCode SpecificEntitiesXmlLogger::process( continue; } v = annotationData->intAnnotation(*itv,Common::Misc::utf8stdstring2limastring(m_graph)); - outputEntity(out,v,annot,tokenMap,offset); + outputEntity(annotationData,out,v,annot,tokenMap,offset); } } } @@ -270,18 +270,20 @@ LimaStatusCode SpecificEntitiesXmlLogger::process( } void SpecificEntitiesXmlLogger:: -outputEntity(std::ostream& 
out, - LinguisticGraphVertex v, - const SpecificEntityAnnotation* annot, - const VertexTokenPropertyMap& tokenMap, - uint64_t offset) const +outputEntity( AnnotationData* annotationData, + std::ostream& out, + LinguisticGraphVertex v, + const SpecificEntityAnnotation* annot, + const VertexTokenPropertyMap& tokenMap, + uint64_t offset) const { LinguisticAnalysisStructure::Token* vToken = tokenMap[v]; // LDEBUG << "SpecificEntitiesXmlLogger tokenMap[" << v << "] = " << vToken; if (vToken == 0) { SELOGINIT; - LERROR << "Vertex " << v << " has no entry in the analysis graph token map. This should not happen !!"; + LERROR << "SpecificEntitiesXmlLogger::outputEntity: Vertex " << v + << " has no entry in the analysis graph token map. This should not happen !!"; } else { @@ -307,13 +309,61 @@ outputEntity(std::ostream& out, featureItr!=features_end; featureItr++) { if( featureItr->getPosition() != UNDEFPOSITION ) { - out << "<" << featureItr->getName(); + out << "<" << featureItr->getName(); out << " pos=\"" << featureItr->getPosition() << "\""; out << " len=\"" << featureItr->getLength() << "\""; - out << ">"; - out << Common::Misc::limastring2utf8stdstring(Common::Misc::transcodeToXmlEntities(Common::Misc::utf8stdstring2limastring(featureItr->getValueString()))) - << "getName() << ">"; - } + out << ">"; + out << Common::Misc::limastring2utf8stdstring(Common::Misc::transcodeToXmlEntities(Common::Misc::utf8stdstring2limastring(featureItr->getValueString()))) + << "getName() << ">"; + } + } + + // TODO: Follow "belongstose" links to outputs embeded entities as components + // Get the current annotationVertex (is there any more simple solution???) 
+ std::set< AnnotationGraphVertex > matches = annotationData->matches(m_graph,v,"annot"); + AnnotationGraphVertex va1; + std::set< AnnotationGraphVertex >::const_iterator it = matches.begin(); + for( ; it != matches.end(); it++) + { + va1=*it; + SELOGINIT; + LDEBUG << "SpecificEntitiesXmlLogger::outputEntity: get agv = " << va1; + if (annotationData->hasAnnotation(va1, Common::Misc::utf8stdstring2limastring("SpecificEntity"))) + break; + } + if( it == matches.end() ) + { + SELOGINIT; + LERROR << "SpecificEntitiesXmlLogger::outputEntity: could not find annotation of node " << v << "in LinguisticGraph"; + } + else + { + SELOGINIT; + LDEBUG << "SpecificEntitiesXmlLogger::outputEntity: agv " << va1 << " is a SpecificEntity Annotation"; + // Follow "belongstose" out_edges to get annotationVertex of embededed NE + AnnotationGraphOutEdgeIt it1, it1_end; + boost::tie(it1, it1_end) = boost::out_edges(va1, annotationData->getGraph()); + for (; it1 != it1_end; it1++) + { + if ( annotationData->intAnnotation((*it1), Common::Misc::utf8stdstring2limastring("holds"))==1) + { + AnnotationGraphVertex va2; + va2 = target(*it1, annotationData->getGraph()); + LDEBUG << "SpecificEntitiesXmlLogger::outputEntity: embeded agv = " << va2; + // rcuprer le noeud du graphe linguistique + LinguisticGraphVertex v2 = annotationData->intAnnotation(va2, Common::Misc::utf8stdstring2limastring(m_graph)); + LDEBUG << "SpecificEntitiesXmlLogger::outputEntity: vertex in " << m_graph << " is " << v2; + // rcuprer l'annotation SpecifiEntity + if (annotationData->hasAnnotation(va2, Common::Misc::utf8stdstring2limastring("SpecificEntity"))) + { + SpecificEntityAnnotation* annot2 = + annotationData->annotation(va2, Common::Misc::utf8stdstring2limastring("SpecificEntity")).pointerValue(); + LDEBUG << "SpecificEntitiesXmlLogger::outputEntity: annot2 = " << annot2; + outputEntity(annotationData,out, v2, annot2, tokenMap, offset); + break; + } + } + } } out << "" << ""; diff --git 
a/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesXmlLogger.h b/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesXmlLogger.h index 6a9e25dbc..a91accf65 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesXmlLogger.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesXmlLogger.h @@ -65,7 +65,8 @@ class LIMA_SPECIFICENTITIES_EXPORT SpecificEntitiesXmlLogger : public AbstractTe const SpecificEntityAnnotation* getSpecificEntityAnnotation(LinguisticGraphVertex v, const Common::AnnotationGraphs::AnnotationData* annotationData) const; - void outputEntity(std::ostream& out, + void outputEntity(Common::AnnotationGraphs::AnnotationData* annotationData, + std::ostream& out, LinguisticGraphVertex v, const SpecificEntityAnnotation* annot, const VertexTokenPropertyMap& tokenMap, diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/DotDependencyGraphWriter.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/DotDependencyGraphWriter.cpp index e62545eae..509091067 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/DotDependencyGraphWriter.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/DotDependencyGraphWriter.cpp @@ -350,7 +350,7 @@ void DotDependencyGraphWriter::write_graphviz( } os << v << " -> " << next << " "; - LDEBUG << "PosTaggingDepGraphEdgeWriter for "< " << next; + LTRACE << "PosTaggingDepGraphEdgeWriter for "< " << next; PosTaggingDepGraphEdgeWriter(&lposgraph,m_language,depGraph,syntacticData)(os,*outItr); } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/HomoSyntagmaticConstraints.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/HomoSyntagmaticConstraints.cpp index 8de299724..401495fae 100644 --- 
a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/HomoSyntagmaticConstraints.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/HomoSyntagmaticConstraints.cpp @@ -164,13 +164,10 @@ bool SecondUngovernedBy::operator()( const LinguisticGraphVertex& v2, AnalysisContent& analysis ) const { -/* - Critical Function : comment logging messages -*/ -// SAPLOGINIT; -// LDEBUG << "testing SecondUngovernedBy for " -// << v1 << " and " << v2 -// << " with relation: " << m_relation; +#ifdef DEBUG_LP + SAPLOGINIT; + LDEBUG << "testing SecondUngovernedBy for " << v1 << " and " << v2 << " with relation: " << m_relation; +#endif const SyntacticData* syntacticData=static_cast(analysis.getData("SyntacticData")); @@ -223,12 +220,10 @@ bool GovernorOf::operator()(const AnalysisGraph& graph, const LinguisticGraphVertex& v1, AnalysisContent& analysis) const { -/* - Critical function : comment logging messages -*/ -// SAPLOGINIT; -// LDEBUG << "testing GovernorOf for " << v1 -// << " with relation : " << m_relation; +#ifdef DEBUG_LP + SAPLOGINIT; + LDEBUG << "testing GovernorOf for " << v1 << " with relation : " << m_relation; +#endif const SyntacticData* syntacticData=static_cast(analysis.getData("SyntacticData")); @@ -266,12 +261,10 @@ bool GovernedBy::operator()(const AnalysisGraph& graph, const LinguisticGraphVertex& v1, AnalysisContent& analysis) const { -/* - Critical function : comment logging message -*/ -// SAPLOGINIT; -// LDEBUG << "testing GovernedBy for " << v1 -// << " with relation: " << m_relation; +#ifdef DEBUG_LP + SAPLOGINIT; + LDEBUG << "testing GovernedBy for " << v1 << " with relation: " << m_relation; +#endif const SyntacticData* syntacticData=static_cast(analysis.getData("SyntacticData")); if (v1 == graph.firstVertex() || v1 == graph.lastVertex() ) { @@ -310,12 +303,10 @@ bool SameNominalChain::operator()(const AnalysisGraph& graph, const LinguisticGraphVertex& v2, AnalysisContent& /*ac*/) const { -/* - 
Critical function : comment logging message -*/ -// SAPLOGINIT; -// LDEBUG << "testing SameNominalChain for " << v1 << " and " << v2 -// ; +#ifdef DEBUG_LP + SAPLOGINIT; + LDEBUG << "testing SameNominalChain for " << v1 << " and " << v2; +#endif CVertexChainIdPropertyMap map = get(vertex_chain_id, *(graph.getGraph())); VertexChainIdProp::const_iterator it1 = map[v1].begin(); @@ -356,13 +347,10 @@ bool SameVerbalChain::operator()(const AnalysisGraph& graph, const LinguisticGraphVertex& v2, AnalysisContent& /*ac*/) const { -/* - Critical function : comment logging message -*/ - // return graph.SameVerbalChain(v1, v2, false); - -// SAPLOGINIT; -// LDEBUG << "testing SameVerbalChain for " << v1 << " and " << v2; +#ifdef DEBUG_LP + SAPLOGINIT; + LDEBUG << "testing SameVerbalChain for " << v1 << " and " << v2; +#endif CVertexChainIdPropertyMap map = get(vertex_chain_id, *(graph.getGraph())); VertexChainIdProp::const_iterator it1 = map[v1].begin(); VertexChainIdProp::const_iterator it1_end = map[v1].end(); @@ -406,12 +394,11 @@ bool CreateRelationBetween::operator()(const AnalysisGraph&, const LinguisticGraphVertex& v2, AnalysisContent& analysis ) const { -/* - Critical function : comment logging message -*/ -// SAPLOGINIT; -// LDEBUG << "testing CreateRelationBetween for " << v1 << " and " -// << v2 << " with relation: " << static_cast(Common::MediaticData::MediaticData::single().mediaData(m_language_id)).getSyntacticRelationName(m_relation); +#ifdef DEBUG_LP + SAPLOGINIT; + LDEBUG << "testing CreateRelationBetween for " << v1 << " and " + << v2 << " with relation: " << static_cast(Common::MediaticData::MediaticData::single().mediaData(m_language_id)).getSyntacticRelationName(m_relation); +#endif SyntacticData* syntacticData=static_cast(analysis.getData("SyntacticData")); bool res = syntacticData->relation(v1, v2, m_relation); // LDEBUG << "CreateRelationBetween: " << (res?"yes":"no"); @@ -777,7 +764,7 @@ bool CreateRelationReverseWithRelated::operator()( 
//********************************************************************** // complement contains symbols for category and microcategory -// (e.g.: L_NC;L_NC_GEN;) +// (e.g.: NC;NC_GEN;) CreateCompoundTense::CreateCompoundTense(MediaId language, const LimaString& complement): ConstraintFunction(language,complement), @@ -798,7 +785,13 @@ CreateCompoundTense::CreateCompoundTense(MediaId language, size_t secondSepPos = str.find_first_of(';', firstSepPos+1); m_micro=static_cast(Common::MediaticData::MediaticData::single().mediaData(language)).getPropertyCodeManager().getPropertyManager("MICRO").getPropertyValue(str.substr(firstSepPos + 1, secondSepPos - firstSepPos - 1)); - m_tempCompType=static_cast(Common::MediaticData::MediaticData::single().mediaData(language)).getSyntacticRelationId("TEMPCOMP"); +#ifdef ANTINNO_SPECIFIC + // Attention, si on passe aux, il faut modifier mm common de la langue en conséquence + m_tempCompType=static_cast(Common::MediaticData::MediaticData::single().mediaData(language)).getSyntacticRelationId("TEMPCOMP"); +#else + m_tempCompType=static_cast(Common::MediaticData::MediaticData::single().mediaData(language)).getSyntacticRelationId("aux"); +#endif + #ifdef DEBUG_LP LDEBUG << "CreateCompoundTense::CreateCompoundTense() m_tempCompType" << m_tempCompType; #endif @@ -817,13 +810,9 @@ bool CreateCompoundTense::operator()(const AnalysisGraph& anagraph, const LinguisticGraphVertex& auxVertex, AnalysisContent& analysis ) const { -/* - Critical function : comment logging message -*/ #ifdef DEBUG_LP SAPLOGINIT; - LDEBUG << "creating compound tense for " << auxVertex << " and " - << pastPartVertex; + LDEBUG << "creating compound tense for " << auxVertex << " and " << pastPartVertex; #endif SyntacticData* syntacticData=static_cast(analysis.getData("SyntacticData")); @@ -899,9 +888,7 @@ bool CreateCompoundTense::operator()(const AnalysisGraph& anagraph, // creer un MorphoSyntacticData #ifdef DEBUG_LP - LDEBUG << "Creating a DicoWord: " - << int(m_macro) 
<< " / " << Common::Misc::limastring2utf8stdstring(verbFlex) << " / " - << int(m_micro) << " / " << verbLemma; + LDEBUG << "Creating a DicoWord: " << m_macro << " / " << verbFlex << " / " << m_micro << " / " << verbLemma; #endif MorphoSyntacticData* dataNewVerb = new MorphoSyntacticData(); /// if the anagraph is not set to delete the morphosyntactic data, we have to do it @@ -1110,7 +1097,7 @@ bool CreateCompoundTense::operator()(const AnalysisGraph& anagraph, } // copier vers le noeud du nouveau verbe toutes les relations de - // dépendance (sauf TEMPCOMP) qui avaient pour source ou destination + // dépendance (sauf aux) qui avaient pour source ou destination // l'auxiliaire ou le participe passé EdgeDepRelTypePropertyMap edgeTypeMap = get( edge_deprel_type, depGraph); @@ -1248,7 +1235,7 @@ CreateEasyCompoundTense::CreateEasyCompoundTense(MediaId language, size_t secondSepPos = str.find_first_of(';', firstSepPos+1); m_micro=static_cast(Common::MediaticData::MediaticData::single().mediaData(language)).getPropertyCodeManager().getPropertyManager("MICRO").getPropertyValue(str.substr(firstSepPos + 1, secondSepPos - firstSepPos - 1)); - m_tempCompType=static_cast(Common::MediaticData::MediaticData::single().mediaData(language)).getSyntacticRelationId("TEMPCOMP"); + m_tempCompType=static_cast(Common::MediaticData::MediaticData::single().mediaData(language)).getSyntacticRelationId("aux"); m_macroAccessor=&(static_cast(Common::MediaticData::MediaticData::single().mediaData(language)).getPropertyCodeManager().getPropertyAccessor("MACRO")); m_microAccessor=&(static_cast(Common::MediaticData::MediaticData::single().mediaData(language)).getPropertyCodeManager().getPropertyAccessor("MICRO")); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/HomoSyntagmaticConstraints.h b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/HomoSyntagmaticConstraints.h index efc33adf4..9bc25fba8 100644 --- 
a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/HomoSyntagmaticConstraints.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/HomoSyntagmaticConstraints.h @@ -332,7 +332,7 @@ class LIMA_SYNTACTICANALYSIS_EXPORT CopyIncomingRelationsTo : public Automaton:: QStringList m_relations; }; -/** @brief This constraint creates a TEMPCOMP relation between its two +/** @brief This constraint creates a aux relation between its two * parameters * * @todo It was originaly supposed to replace the two vertices (auxiliary and diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SelectionalPreferences.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SelectionalPreferences.cpp index 4e70d54a8..fbaffe833 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SelectionalPreferences.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SelectionalPreferences.cpp @@ -27,6 +27,7 @@ */ #include "SelectionalPreferences.h" +#include "common/tools/FileUtils.h" #include "common/Data/strwstrtools.h" #include "common/AbstractFactoryPattern/SimpleFactory.h" #include "common/MediaticData/mediaticData.h" @@ -66,7 +67,7 @@ void SelectionalPreferences::init( try { std::string resourcePath=Common::MediaticData::MediaticData::single().getResourcesPath(); - std::string preferencesFileName=resourcePath + "/" + unitConfiguration.getParamsValueAtKey("file"); + std::string preferencesFileName = Common::Misc::findFileInPaths(resourcePath.c_str(), unitConfiguration.getParamsValueAtKey("file").c_str()).toUtf8().constData(); loadFromFile(preferencesFileName); } catch (Common::XMLConfigurationFiles::NoSuchParam& ) @@ -93,8 +94,7 @@ void SelectionalPreferences::loadFromFile(const std::string& fileName) return; } - std::string line; - getline(ifl, line); + std::string line = Lima::Common::Misc::readLine(ifl); 
Common::Misc::chomp(line); linesCounter++; while (ifl.good() && !ifl.eof()) @@ -158,7 +158,7 @@ void SelectionalPreferences::loadFromFile(const std::string& fileName) boost::tuple< std::string, LinguisticCode, std::string, std::string, LinguisticCode > tuple(target,targetMacro,dependency,source,soureceMacro); m_preferences.insert(std::make_pair(tuple, probability)); } - getline(ifl, line); + line = Lima::Common::Misc::readLine(ifl); Common::Misc::chomp(line); linesCounter++; } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SelectionalRestrictionsConstraints.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SelectionalRestrictionsConstraints.cpp index 89781b47a..e1d6a6515 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SelectionalRestrictionsConstraints.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SelectionalRestrictionsConstraints.cpp @@ -169,7 +169,12 @@ bool DisambiguateWith::operator()(const AnalysisGraph& graph, { SAPLOGINIT; LERROR << "no graph 'PosGraph' available !"; +#ifdef ANTINNO_SPECIFIC + // FWI 26/06/2016 doit retourner un boolen + return false; + #else return MISSING_DATA; +#endif } LinguisticGraph* lingGraph = const_cast(posgraph->getGraph()); // LDEBUG << "There is " << out_degree(v2, *lingGraph) << " edges out of " << v2; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntacticAnalysisTools.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntacticAnalysisTools.cpp index e11c90c24..246daed2c 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntacticAnalysisTools.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntacticAnalysisTools.cpp @@ -394,7 +394,7 @@ displayRelationsDistanceOfArguments(const SyntacticData& data, } else { // can be equal (l', n'...) 
or - // negative (TEMPCOMP -> length of auxiliary has changed) + // negative (aux -> length of auxiliary has changed) out << 0; } out << endl; @@ -467,7 +467,7 @@ void SyntacticAnalysisTools::displayRelationsXMLFormat(const SyntacticData& data } else { // can be equal (l', n'...) or - // negative (TEMPCOMP -> length of auxiliary has changed) + // negative (aux -> length of auxiliary has changed) pathDistance = 0; } xmlStream << "" diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntacticAnalyzer-chains.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntacticAnalyzer-chains.cpp index c93efcf9d..e3efd6b05 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntacticAnalyzer-chains.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntacticAnalyzer-chains.cpp @@ -1,771 +1,863 @@ -/* - Copyright 2002-2013 CEA LIST - - This file is part of LIMA. - - LIMA is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - LIMA is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with LIMA. 
If not, see -*/ -/** - * - * @file SyntacticAnalyzer-chains.cpp - * @author Gael de Chalendar (Gael.de-Chalendar@cea.fr) - - * Copyright (c) 2003 by CEA - * @date Created on Aug, 31 2004 - * @version $Id$ - * - */ - -#include "SyntacticAnalyzer-chains.h" -#include "SyntagmaticMatrix.h" - -#include "common/AbstractFactoryPattern/SimpleFactory.h" -#include "linguisticProcessing/core/LinguisticResources/LinguisticResources.h" -#include "linguisticProcessing/core/LinguisticAnalysisStructure/AnalysisGraph.h" -#include "linguisticProcessing/core/TextSegmentation/SegmentationData.h" -#include "common/time/timeUtilsController.h" -#include "common/LimaCommon.h" - -#undef min -#undef max - -using namespace std; -//using namespace boost; -using namespace Lima::Common::MediaticData; -using namespace Lima::LinguisticProcessing::LinguisticAnalysisStructure; - -namespace Lima -{ -namespace LinguisticProcessing -{ -namespace SyntacticAnalysis -{ - -static const uint64_t DEFAULT_MAXCHAINSNBBYVERTEX = 30; -static const uint64_t DEFAULT_MAXCHAINLENGTH = 200; - -SimpleFactory syntacticAnalyzerChainsFactory(SYNTACTICANALYZERCHAINS_CLASSID); - -SyntacticAnalyzerChains::SyntacticAnalyzerChains() : - m_language(), - m_chainMatrix(0), - m_maxChainsNbByVertex(std::numeric_limits::max()) -{} - -void SyntacticAnalyzerChains::init( - Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, - Manager* manager) - -{ - SACLOGINIT; - m_language=manager->getInitializationParameters().media; - m_macroAccessor=&(static_cast(Common::MediaticData::MediaticData::single().mediaData(m_language)).getPropertyCodeManager().getPropertyAccessor("MACRO")); - try - { - std::string chainMatrixId=unitConfiguration.getParamsValueAtKey("chainMatrix"); - m_chainMatrix=static_cast(LinguisticResources::single().getResource(m_language,chainMatrixId)); - } - catch (Common::XMLConfigurationFiles::NoSuchParam& ) - { - LERROR << "no parameter 'chainMatrix' in SyntacticAnalyzerChains group for language 
" << (int) m_language << " !"; - throw InvalidConfiguration(); - } - try - { - std::string maxChainsNbByVertexS=unitConfiguration.getParamsValueAtKey("maxChainsNbByVertex"); - std::istringstream iss(maxChainsNbByVertexS); - iss >> m_maxChainsNbByVertex; - } - catch (Common::XMLConfigurationFiles::NoSuchParam& ) - { - LWARN << "no parameter 'maxChainsNbByVertex' in SyntacticAnalyzerChains group for language " << (int) m_language << " ! Using default: "<> m_maxChainLength; - } - catch (Common::XMLConfigurationFiles::NoSuchParam& ) - { - LWARN << "no parameter 'maxChainLength' in SyntacticAnalyzerChains group for language " << (int) m_language << " ! Using default: "<(Common::MediaticData::MediaticData::single().mediaData(m_language)).getPropertyCodeManager().getPropertyManager("MACRO").getPropertyValue(id); - } - catch (Common::XMLConfigurationFiles::NoSuchParam& ) - { - LWARN << "No ponctu macrocategory defined ! use category PONCTU"; - m_ponctuCategory=static_cast(Common::MediaticData::MediaticData::single().mediaData(m_language)).getPropertyCodeManager().getPropertyManager("MACRO").getPropertyValue("PONCTU"); - } - -} - -LimaStatusCode SyntacticAnalyzerChains::process( - AnalysisContent& analysis) const -{ - Lima::TimeUtilsController timer("SyntacticAnalysis"); - SACLOGINIT; - LINFO << "start syntactic analysis - chains"; - // create syntacticData - AnalysisGraph* anagraph=static_cast(analysis.getData("PosGraph")); - if (anagraph==0) - { - LERROR << "no PosGraph ! abort"; - return MISSING_DATA; - } - SegmentationData* sb=static_cast(analysis.getData("SentenceBoundaries")); - if (sb==0) - { - LERROR << "no sentence bounds ! 
abort"; - return MISSING_DATA; - } - if (sb->getGraphId() != "PosGraph") { - LERROR << "SentenceBounds have been computed on " << sb->getGraphId() << " !"; - LERROR << "SyntacticAnalyzer-deps needs SentenceBounds on PosGraph"; - return INVALID_CONFIGURATION; - } - - SyntacticData* syntacticData=dynamic_cast(analysis.getData("SyntacticData")); - if (syntacticData==0) - { - syntacticData=new SyntacticData(anagraph,m_chainMatrix); - analysis.setData("SyntacticData",syntacticData); - } - else if (syntacticData->matrices() == 0) - { - syntacticData->matrices(m_chainMatrix); - } - syntacticData->setupDependencyGraph(); - - uint64_t chainId = m_firstChainId; - std::list ponctuMacroFilter; - ponctuMacroFilter.push_back(m_ponctuCategory); - -// bool l2r = true; - // ??OME2 for (SegmentationData::const_iterator boundItr=sb->begin(); - // boundItr!=sb->end(); - for (std::vector::const_iterator boundItr=(sb->getSegments()).begin(); - boundItr!=(sb->getSegments()).end(); - boundItr++) - { - LinguisticGraphVertex beginSentence=boundItr->getFirstVertex(); - LinguisticGraphVertex endSentence=boundItr->getLastVertex(); -// LDEBUG << "analyze sentence from vertex " << beginSentence << " to vertex " << endSentence; - LinguisticGraphVertex current, next; - current = beginSentence; next = current; - while (next != endSentence) - { -// LDEBUG << "nextChainsBreak"; - next = anagraph->nextMainPathVertex(current,*m_macroAccessor,ponctuMacroFilter,endSentence); -// LDEBUG << "analyze chain from " << current << " to " << next; -// LDEBUG << "identify chains"; - identifyChains(syntacticData,current,next,chainId); - current = next; - } - beginSentence=endSentence; - } - - LINFO << "end syntactic analysis - chains"; - return SUCCESS_ID; -} - - -void SyntacticAnalyzerChains::identifyChains(SyntacticData* data, - const LinguisticGraphVertex& start, - const LinguisticGraphVertex& stop, - uint64_t& startChainId) const -{ -// SACLOGINIT; -// LDEBUG << "Searching chains from/to (morph): " << start << 
"/" << stop; - if (start == stop) - return; - VertexChainIdPropertyMap vertexChainIdMap = get( vertex_chain_id, *(data->graph()) ); - std::set< std::string > alreadyReported; - LinguisticGraphVertex first = data->iterator()-> firstVertex(); - LinguisticGraphVertex last = data->iterator()-> lastVertex(); - VertexDataPropertyMap dataMap = get(vertex_data, (*data->iterator()->getGraph() ) ); -// VertexTokenPropertyMap tokenMap =get(vertex_token, (*data->iterator()->getGraph() ) ); - - std::vector< ChainStackTuple > pile; - // std::stack< LinguisticGraphVertex > pileSons; - Common::MediaticData::ChainsType currentType = Common::MediaticData::NO_CHAIN_TYPE; -// std::stack< std::pair< std::deque< ChainStackTuple >, std::stack< LinguisticGraphVertex > > > tank; - std::vector< std::vector< ChainStackTuple > > tank; - std::set< LinguisticGraphVertex > alreadyFinished; - std::vector nextVxs; -// LDEBUG << "Initializing nextVxs with " << start; - nextVxs.push_back(start); - - - while (! ( tank.empty() && nextVxs.empty()) ) - { -// LDEBUG << "LOOP"; - if (pile.size() >= m_maxChainLength) - { -#ifdef DEBUG_LP - SACLOGINIT; - LNOTICE << "Chain reached its max size or is too long."; -#endif -// LDEBUG << "Trying to find a chain end in the too long stack"; - LinguisticGraphVertex lastChainVx = unstackUptoChainEnd(data, pile, currentType); - if (lastChainVx != first) { -// LDEBUG << "Chain end is " << lastChainVx << ". 
Reporting the chain in the graph."; - std::string newChainString = stringChain(data,pile, currentType, alreadyFinished,startChainId,lastChainVx); - alreadyReported.insert(newChainString); - reportChainInGraph(data,pile, currentType, alreadyFinished,startChainId,lastChainVx); - LinguisticGraphOutEdgeIt it, it_end; - boost::tie(it, it_end) = out_edges(lastChainVx, *(data->graph())); -// LDEBUG << "Initializing for the sons of " << lastChainVx; - for (; it != it_end; it++) - { -// LDEBUG << "Looking at an out edge of the chain's last vertex : " << *it; - LinguisticGraphVertex nextVx = target(*it, *(data->graph())); - if (alreadyFinished.find(nextVx) == alreadyFinished.end()) - { -// LDEBUG << "Adding " << nextVx << " to nextVxs"; - nextVxs.push_back(nextVx); - } - } - } - else { -// LDEBUG << "NoChainEndInStack"; - } - if ( ! tank.empty() ) - { -// LDEBUG << "Using a new stack after chain too long"; -// boost::tie(pile, pileSons) = tank.back(); - pile = tank.back(); - tank.pop_back(); - } - } - else if (tank.empty()) - { -// LDEBUG << "tank is empty"; - LinguisticGraphVertex nextVx = nextVxs.back(); - nextVxs.pop_back(); - while (alreadyFinished.find(nextVx) != alreadyFinished.end()) - { - if (nextVxs.empty()) - { -// LDEBUG << "Nothing more to work on: returning"; - return; - } -// LDEBUG << "Ignoring next vertex " << nextVx << " because it is already finished."; - nextVx = nextVxs.back(); - nextVxs.pop_back(); - while ((vertexChainIdMap[nextVx].size() >= m_maxChainsNbByVertex) ) - { - SACLOGINIT; - LNOTICE << "Vertex ignored (" << nextVx << ") because there is too much chains on it."; -// LDEBUG << "Ignoring next vertex " << nextVx << " because there is too much chains on it."; - if (nextVxs.empty()) - { -// LDEBUG << "Nothing more to work on: returning"; - return; - } - nextVx = nextVxs.back(); - nextVxs.pop_back(); - } - } -// LDEBUG << "next vertex is " << nextVx; - bool canFinish = false; - pile.clear(); -// pileSons = std::stack< LinguisticGraphVertex >(); - if 
( (nextVx != first) && (nextVx != last) && - ( data->matrices()->canNominalChainBeginBy(dataMap[nextVx]) ) ) - { -// LDEBUG << "next vertex is a nominal chain beginning"; - canFinish = (data->matrices()-> canNominalChainEndBy(dataMap[nextVx])); - pile.push_back(boost::make_tuple(nextVx, canFinish, std::vector< LinguisticGraphVertex >())); - currentType = NOMINAL; - } - else if ( (nextVx != first) && (nextVx != last) && - ( data->matrices()-> canVerbalChainBeginBy(dataMap[nextVx]) ) ) - { -// LDEBUG << "next vertex is a verbal chain beginning"; - canFinish = ( data->matrices()-> canVerbalChainEndBy(dataMap[nextVx])); - pile.push_back(boost::make_tuple(nextVx, canFinish, std::vector< LinguisticGraphVertex >())); - currentType = VERBAL; - } - else - { -// LDEBUG << "next vertex " << nextVx << " is not a chain beginning"; - currentType = NO_CHAIN_TYPE; -// LDEBUG << "Adding nextVx " << nextVx << " to alreadyFinished"; -// alreadyFinished.insert(nextVx); - } - - if (nextVx != stop) - { - std::vector< LinguisticGraphVertex > sons; - LinguisticGraphOutEdgeIt it, it_end; - boost::tie(it, it_end) = out_edges(nextVx, *(data->graph())); - for (; it != it_end; it++) - { -// LDEBUG << "Looking at the next vertex out edge: " << *it; - LinguisticGraphVertex nextNext = target(*it, *(data->graph())); - if (nextNext != last) - { - if ( ( alreadyFinished.find(nextNext) == alreadyFinished.end()) && (currentType != NO_CHAIN_TYPE) ) - { -// LDEBUG << "Adding " << nextNext << " to sons of " << nextVx; - sons.push_back(nextNext); - } - else - { -// LDEBUG << "Adding " << nextNext << " to nextVxs"; - nextVxs.push_back(nextNext); - // The addition of the line below seems to solve a loop problem - // whithout producing regressions in TVA tests. 
- alreadyFinished.insert(nextVx); - } - } - } - if (!sons.empty() && !pile.empty()) - { -// LDEBUG << nextVx << " has sons: pushing them to the tank"; -// tank.push_back(std::make_pair(pile, sons)); - pile.back().get<2>() = sons; - tank.push_back(pile); - } - } - } - else - { - LinguisticGraphVertex father = pile.back().get<0>(); - LinguisticGraphVertex currentSon = pile.back().get<2>().back(); -// LDEBUG << "Father and current son are: " << father << " / " << currentSon; - pile.back().get<2>().pop_back(); - if ( (currentType == NO_CHAIN_TYPE) && (pile.empty()) ) - { - if ( data->matrices()->canNominalChainBeginBy(dataMap[currentSon])) - currentType = NOMINAL; - else if ( data->matrices()->canVerbalChainBeginBy(dataMap[currentSon])) - currentType = VERBAL; - } - - if ( currentType != NO_CHAIN_TYPE ) - { -// LDEBUG << "Current type is " << currentType; - // -------------> - // endroit ou mettre le bloc deplace - // <------------- - if ( (currentSon != last) && - ( data->matrices()-> belongsToMatrix( - dataMap[father], - dataMap[currentSon], - currentType ) ) ) - { -// LDEBUG << father << " -> " << currentSon << " is in the matrix"; - bool canFinish = ( data->matrices()->canChainEndBy(dataMap[currentSon], currentType)); - // bloc ci-dessous a deplacer plus haut pour explorer - // toutes les chaines. Pb: rend le parcours tres tres lourd. 
- // -------------> - if (!pile.empty() && !pile.back().get<2>().empty()) - { -// LDEBUG << father << " has remaining sons: pushing them to the tank"; -// tank.push_back(std::make_pair(pile, pileSons)); - tank.push_back(pile); - } - // <------------- -// LDEBUG << "Pushing " << currentSon << "(" << canFinish << ")"; - pile.push_back(boost::make_tuple(currentSon, canFinish, std::vector< LinguisticGraphVertex >())); - if (currentSon != stop) - { - std::vector< LinguisticGraphVertex >& sons = pile.back().get<2>(); - LinguisticGraphOutEdgeIt it, it_end; - boost::tie(it, it_end) = out_edges(currentSon, *(data->graph())); - for (; it != it_end; it++) - { -// LDEBUG << "Edge is " << *it; -// LDEBUG << "Adding " << target(*it, *(data->graph())) << " to sons of " << currentSon; - sons.push_back(target(*it, *(data->graph()))); - } - } - else - { -// LDEBUG << "Stop reached"; - if (canFinish) - { -// LDEBUG << "currentSon " << currentSon << " is a possible end. Reporting the chain in the graph."; - std::string newChainString = stringChain(data, pile, currentType, alreadyFinished,startChainId,currentSon); - alreadyReported.insert(newChainString); - reportChainInGraph(data, pile, currentType, alreadyFinished,startChainId, currentSon); - } - else - { -// LDEBUG << "currentSon " << currentSon << " is not a possible end."; -// LDEBUG << "Trying to find a chain end in the stack"; - LinguisticGraphVertex lastChainVx = unstackUptoChainEnd(data, pile, currentType); - if (lastChainVx!=first) { -// LDEBUG << "Chain end is " << lastChainVx << ". 
Reporting the chain in the graph."; - std::string newChainString = stringChain(data, pile, currentType, alreadyFinished,startChainId,lastChainVx); - alreadyReported.insert(newChainString); - reportChainInGraph(data, pile, currentType, alreadyFinished,startChainId,lastChainVx); - LinguisticGraphOutEdgeIt it, it_end; - boost::tie(it, it_end) = out_edges(lastChainVx, *(data->graph())); -// LDEBUG << "Initializing for the sons of " << lastChainVx; - for (; it != it_end; it++) - { -// LDEBUG << "Looking at an out edge of the chain's last vertex : " << *it; - LinguisticGraphVertex nextVx = target(*it, *(data->graph())); - if (alreadyFinished.find(nextVx) == alreadyFinished.end()) - { -// LDEBUG << "Adding " << nextVx << " to nextVxs"; - nextVxs.push_back(nextVx); - } - } - } -// else -// { -// LDEBUG << "NoChainEndInStackException catched"; -// } - } - } - } - else - { -// LDEBUG << father << " -> " << currentSon << " NOT in the matrix"; - LinguisticGraphVertex lastChainVx = unstackUptoChainEnd(data, pile, currentType); - if (lastChainVx!=first) - { - std::string newChainString = stringChain(data, pile, currentType, alreadyFinished,startChainId,lastChainVx); - if (alreadyReported.find(newChainString) == alreadyReported.end()) - { -// LDEBUG << "Reporting chain: " << newChainString; - alreadyReported.insert(newChainString); - reportChainInGraph(data, pile, currentType, alreadyFinished,startChainId,lastChainVx); - LinguisticGraphOutEdgeIt it, it_end; - boost::tie(it, it_end) = out_edges(lastChainVx, *(data->graph())); -// LDEBUG << "Initializing for the sons of " << lastChainVx << " after unstacking"; - for (; it != it_end; it++) - { -// LDEBUG << "Looking at an out edge of the chain's last vertex : " << *it; - LinguisticGraphVertex nextVx = target(*it, *(data->graph())); - if (alreadyFinished.find(nextVx) == alreadyFinished.end()) - { -// LDEBUG << "Adding " << nextVx << " to nextVxs"; - nextVxs.push_back(nextVx); - } - } - } -// else -// { -// LDEBUG << "This chain (" << 
newChainString << ") has already been found. Nothing to do."; -// } - } - else - { -// LDEBUG << "No end of chain found in pile"; - if (alreadyFinished.find(currentSon) == alreadyFinished.end()) - { - if ( parentsFinished(data, father, alreadyFinished ) ) - { -// LDEBUG << "Adding father " << father << " to alreadyFinished"; - alreadyFinished.insert(father); - } - if (currentSon != last) - { -// LDEBUG << "Adding " << currentSon << " to nextVxs"; - nextVxs.push_back(currentSon); - } - else - { -// LDEBUG << "Adding current son " << currentSon << " to alreadyFinished"; - alreadyFinished.insert(currentSon); - } - } - } - } - } - - if ( (pile.empty() || pile.back().get<2>().empty()) && (! tank.empty()) ) - { -// LDEBUG << "Using a new stack"; -// boost::tie(pile, pileSons) = tank.back(); - pile = tank.back(); - tank.pop_back(); - } - } - } -// LDEBUG << "<========= chains search finished"; -} - -void SyntacticAnalyzerChains::reportChainInGraph( - SyntacticData* data, - const std::vector< ChainStackTuple >& pile, - Common::MediaticData::ChainsType type, - std::set< LinguisticGraphVertex >& alreadyFinished, - uint64_t& chainId, - const LinguisticGraphVertex& stop) const -{ -// SACLOGINIT; -// LDEBUG << "SyntacticAnalyzerChains::reportChainInGraph"; - - ChainIdStruct property = ChainIdStruct(type, chainId); - - VertexChainIdPropertyMap vertexChainIdMap = get( vertex_chain_id, *(data->graph()) ); - - std::vector< ChainStackTuple >::const_iterator it, it_end; - it = pile.begin(); it_end = pile.end(); - for (; it != it_end; it++) - { - LinguisticGraphVertex current = (*it).get<0>(); - if ((vertexChainIdMap[current].size() >= m_maxChainsNbByVertex) ) - { - SACLOGINIT; - LNOTICE << "Too much chains on " << current << " ; cannot add a new one."; - return; - } - } - - - std::vector< ChainStackTuple >::const_iterator it_beg, it_last; - it = pile.begin(); it_beg = pile.begin(); - it_end = pile.end(); it_last = --(pile.end()); - std::ostringstream oss; - for (; it != it_end; it++) 
- { - LinguisticGraphVertex current = (*it).get<0>(); - if (it == it_beg) - { - if (it_beg == it_last) - property = ChainIdStruct(type, chainId, LinguisticAnalysisStructure::UNIGRAM); - else - property = ChainIdStruct(type, chainId, LinguisticAnalysisStructure::BEGIN); - } - else if (it == it_last) - { - property = ChainIdStruct(type, chainId, LinguisticAnalysisStructure::END); - } - else - { - property = ChainIdStruct(type, chainId, LinguisticAnalysisStructure::PART); - } - oss << current; - if (current != data->iterator()->firstVertex() && current != data->iterator()->lastVertex() - && (vertexChainIdMap[current].size() < m_maxChainsNbByVertex) ) - { -// LDEBUG << "executing: vertexChainIdMap[" << current << "].insert(" << property << ")"; - vertexChainIdMap[current].insert(property); - - if (pile.size() > 1) - { - std::vector< ChainStackTuple >::const_iterator it2, it2_end; - it2 = pile.begin(); it2_end = pile.end(); - bool ok = false; - for (; it2 != it2_end; it2++) - { - LinguisticGraphVertex other = (*it2).get<0>(); - if (other != current) - { - LinguisticGraphEdge e; bool found; - boost::tie (e, found) = edge(current, other, *(data->graph())); - if (found) - { - ok = true; - break; - } - else - { - boost::tie(e, found) = edge(other, current, *(data->graph())); - if (found) - { - ok = true; - break; - } - } - } - } - if (!ok) - { - SACLOGINIT; - LWARN << "An edge should exist for " << current << " !"; - } - } - } - else if (vertexChainIdMap[current].size() >= m_maxChainsNbByVertex) - { - SACLOGINIT; - LNOTICE << "Too much chains on " << current << " ; cannot add a new one."; - } - if (current == stop) - break; - else - oss << " "; - if (current != data->iterator()->firstVertex() && current != data->iterator()->lastVertex() - && (vertexChainIdMap[current].size() < m_maxChainsNbByVertex) ) - if (parentsFinished(data, current, alreadyFinished)) - { -/* LDEBUG << "Parents of " << current << " are finished ; so it too."; - alreadyFinished.insert(current);*/ - } - } 
-// LDEBUG << "Chain " << chainId << " is : " << (type==NOMINAL?"nominal":"verbal") << " " << oss.str(); - chainId++; - } - -bool SyntacticAnalyzerChains::parentsFinished( - const SyntacticData* data, - const LinguisticGraphVertex& v, - const std::set< LinguisticGraphVertex >& alreadyFinished) const -{ -/* - Critical function : comment logging messages -*/ -// SACLOGINIT; -// LDEBUG << "SyntacticAnalyzerChains::parentsFinished"; - - LinguisticGraphInEdgeIt it, it_end; - boost::tie(it, it_end) = in_edges(v, *(data->graph())); - for (; it != it_end; it++) - { - if (alreadyFinished.find(source(*it, *(data->graph()))) == alreadyFinished.end()) - return false; - } - return true; -} - -std::string SyntacticAnalyzerChains::stringChain( - const SyntacticData* data, - const std::vector< ChainStackTuple >& pile, - Common::MediaticData::ChainsType type, - std::set< LinguisticGraphVertex >& alreadyFinished, - uint64_t chainId, - const LinguisticGraphVertex& stop) const -{ -/* - Critical Function : comment logging messages -*/ -// SACLOGINIT; - ChainIdStruct property = ChainIdStruct(type, chainId); - - std::vector< ChainStackTuple >::const_iterator it, it_beg, it_end, it_last; - it = pile.begin(); it_beg = pile.begin(); - it_end = pile.end(); it_last = --(pile.end()); - std::ostringstream oss; - for (; it != it_end; it++) - { - if (it == it_beg) - { - if (it_beg == it_last) - property = ChainIdStruct(type, chainId, LinguisticAnalysisStructure::UNIGRAM); - else - property = ChainIdStruct(type, chainId, LinguisticAnalysisStructure::BEGIN); - } - else if (it == it_last) - { - property = ChainIdStruct(type, chainId, LinguisticAnalysisStructure::END); - } - else - { - property = ChainIdStruct(type, chainId, LinguisticAnalysisStructure::PART); - } - oss << (*it).get<0>(); - LinguisticGraphVertex current = (*it).get<0>(); - if (current == stop) - break; - else - oss << " "; - if (current != data->iterator()->firstVertex() && current != data->iterator()->lastVertex()) - { - if 
(pile.size() > 1) - { - std::vector< ChainStackTuple >::const_iterator it2, it2_end; - // @todo replace by lookup only previous and next vertex in pile - it2 = pile.begin(); it2_end = pile.end(); - bool ok = false; - for (; it2 != it2_end; it2++) - { - LinguisticGraphVertex other = (*it2).get<0>(); - if (other != current) - { - LinguisticGraphEdge e; bool found; - boost::tie (e, found) = edge(current, other, *(data->graph())); - if (found) - { - ok = true; - break; - } - else - { - boost::tie (e, found) = edge(other, current, *(data->graph())); - if (found) - { - ok = true; - break; - } - } - } - } - if (!ok) - { - SALOGINIT; - LWARN << "An edge should exist for " << current << " !"; - } - } - } - if ( parentsFinished(data, current, alreadyFinished) ) - { -// LDEBUG << "Adding current " << current << " to alreadyFinished"; - alreadyFinished.insert(current); - } - } -// LDEBUG << "In stringChain, chain " << chainId << " is : " << (type==NOMINAL?"nominal":"verbal") << " " << oss.str(); - return oss.str(); -} - -LinguisticGraphVertex SyntacticAnalyzerChains::unstackUptoChainEnd( - const SyntacticData* data, - std::vector< ChainStackTuple >& pile, - Common::MediaticData::ChainsType type - ) const -{ -/* - Critical function : commeng logging messages -*/ -// SACLOGINIT; -// LDEBUG << "unstackUptoChainEnd " << (type==NOMINAL?"nominal":(type==VERBAL?"verbal":"none")); - CVertexDataPropertyMap dataMap = get( vertex_data, (*data->iterator()->getGraph()) ); - - std::vector< ChainStackTuple >::const_reverse_iterator rit, rit_end; - rit = pile.rbegin(); rit_end = pile.rend(); - for (; rit != rit_end; rit++) - { - if ( data->matrices()->canChainEndBy(dataMap[(*rit).get<0>()], type)) - break; -// LDEBUG << "chain cannot finish by " << (*rit).get<0>(); - } - - if (rit != rit_end) - { - LinguisticGraphVertex newChainEnd = (*rit).get<0>(); -// LDEBUG << "Chain end found in pile: " << newChainEnd; - return (newChainEnd); - } - else - { -// LDEBUG << "No chain end found in pile !"; - 
return data->iterator()->firstVertex(); - } -} - -} // closing namespace SyntacticAnalysis -} // closing namespace LinguisticProcessing -} // closing namespace Lima +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. If not, see +*/ +/** + * + * @file SyntacticAnalyzer-chains.cpp + * @author Gael de Chalendar (Gael.de-Chalendar@cea.fr) + + * Copyright (c) 2003 by CEA + * @date Created on Aug, 31 2004 + * @version $Id$ + * + */ + +#include "SyntacticAnalyzer-chains.h" +#include "SyntagmaticMatrix.h" + +#include "common/AbstractFactoryPattern/SimpleFactory.h" +#include "linguisticProcessing/core/LinguisticResources/LinguisticResources.h" +#include "linguisticProcessing/core/LinguisticAnalysisStructure/AnalysisGraph.h" +#include "linguisticProcessing/core/TextSegmentation/SegmentationData.h" +#include "common/time/timeUtilsController.h" +#include "common/LimaCommon.h" + +#undef min +#undef max + +using namespace std; +//using namespace boost; +using namespace Lima::Common::MediaticData; +using namespace Lima::LinguisticProcessing::LinguisticAnalysisStructure; + +namespace Lima +{ +namespace LinguisticProcessing +{ +namespace SyntacticAnalysis +{ + +static const uint64_t DEFAULT_MAXCHAINSNBBYVERTEX = 30; +static const uint64_t DEFAULT_MAXCHAINLENGTH = 200; + +SimpleFactory syntacticAnalyzerChainsFactory(SYNTACTICANALYZERCHAINS_CLASSID); + 
+SyntacticAnalyzerChains::SyntacticAnalyzerChains() : + m_language(), + m_chainMatrix(0), + m_maxChainsNbByVertex(std::numeric_limits::max()) +{} + +void SyntacticAnalyzerChains::init( + Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, + Manager* manager) + +{ + SACLOGINIT; + m_language=manager->getInitializationParameters().media; + m_macroAccessor=&(static_cast(Common::MediaticData::MediaticData::single().mediaData(m_language)).getPropertyCodeManager().getPropertyAccessor("MACRO")); + try + { + std::string chainMatrixId=unitConfiguration.getParamsValueAtKey("chainMatrix"); + m_chainMatrix=static_cast(LinguisticResources::single().getResource(m_language,chainMatrixId)); + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) + { + LERROR << "no parameter 'chainMatrix' in SyntacticAnalyzerChains group for language " << (int) m_language << " !"; + throw InvalidConfiguration(); + } + try + { + std::string maxChainsNbByVertexS=unitConfiguration.getParamsValueAtKey("maxChainsNbByVertex"); + std::istringstream iss(maxChainsNbByVertexS); + iss >> m_maxChainsNbByVertex; + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) + { + LWARN << "no parameter 'maxChainsNbByVertex' in SyntacticAnalyzerChains group for language " << (int) m_language << " ! Using default: "<> m_maxChainLength; + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) + { + LWARN << "no parameter 'maxChainLength' in SyntacticAnalyzerChains group for language " << (int) m_language << " ! Using default: "<(Common::MediaticData::MediaticData::single().mediaData(m_language)).getPropertyCodeManager().getPropertyManager("MACRO").getPropertyValue(id); + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) + { + LWARN << "No ponctu macrocategory defined ! 
use category PONCTU"; + m_ponctuCategory=static_cast(Common::MediaticData::MediaticData::single().mediaData(m_language)).getPropertyCodeManager().getPropertyManager("MACRO").getPropertyValue("PONCTU"); + } + +} + +LimaStatusCode SyntacticAnalyzerChains::process( + AnalysisContent& analysis) const +{ +#ifdef ANTINNO_SPECIFIC + auto const& stopAnalyze = analysis.stopAnalyze(); +#endif + Lima::TimeUtilsController timer("SyntacticAnalysis"); + SACLOGINIT; + LINFO << "start syntactic analysis - chains"; + // create syntacticData + AnalysisGraph* anagraph=static_cast(analysis.getData("PosGraph")); + if (anagraph==0) + { + LERROR << "no PosGraph ! abort"; + return MISSING_DATA; + } + SegmentationData* sb=static_cast(analysis.getData("SentenceBoundaries")); + if (sb==0) + { + LERROR << "no sentence bounds ! abort"; + return MISSING_DATA; + } + if (sb->getGraphId() != "PosGraph") { + LERROR << "SentenceBounds have been computed on " << sb->getGraphId() << " !"; + LERROR << "SyntacticAnalyzer-deps needs SentenceBounds on PosGraph"; + return INVALID_CONFIGURATION; + } + + SyntacticData* syntacticData=dynamic_cast(analysis.getData("SyntacticData")); + if (syntacticData==0) + { + syntacticData=new SyntacticData(anagraph,m_chainMatrix); + analysis.setData("SyntacticData",syntacticData); + } + else if (syntacticData->matrices() == 0) + { + syntacticData->matrices(m_chainMatrix); + } + syntacticData->setupDependencyGraph(); + + uint64_t chainId = m_firstChainId; + std::list ponctuMacroFilter; + ponctuMacroFilter.push_back(m_ponctuCategory); + +// bool l2r = true; + // ??OME2 for (SegmentationData::const_iterator boundItr=sb->begin(); + // boundItr!=sb->end(); + for (std::vector::const_iterator boundItr=(sb->getSegments()).begin(); + boundItr!=(sb->getSegments()).end(); + boundItr++) + { + LinguisticGraphVertex beginSentence=boundItr->getFirstVertex(); + LinguisticGraphVertex endSentence=boundItr->getLastVertex(); +// LDEBUG << "analyze sentence from vertex " << beginSentence << " 
to vertex " << endSentence; + LinguisticGraphVertex current, next; + current = beginSentence; next = current; + +#ifdef ANTINNO_SPECIFIC + if (stopAnalyze) + { + LERROR << "Analyze too long. Stopped in SyntacticAnalyzerChains"; + return TIME_OVERFLOW; + } +#endif + while (next != endSentence) + { +#ifdef ANTINNO_SPECIFIC + if (stopAnalyze) + { + LERROR << "Analyze too long. Stopped in SyntacticAnalyzerChains"; + return TIME_OVERFLOW; + } +#endif +// LDEBUG << "nextChainsBreak"; + next = anagraph->nextMainPathVertex(current,*m_macroAccessor,ponctuMacroFilter,endSentence); + +#ifdef ANTINNO_SPECIFIC + if (stopAnalyze) + { + LERROR << "Analyze too long. Stopped in SyntacticAnalyzerChains"; + return TIME_OVERFLOW; + } +#endif +// LDEBUG << "analyze chain from " << current << " to " << next; +// LDEBUG << "identify chains"; + identifyChains(syntacticData,current,next,chainId +#ifdef ANTINNO_SPECIFIC + , stopAnalyze +#endif + ); +#ifdef ANTINNO_SPECIFIC + if (stopAnalyze) + { + LERROR << "Analyze too long. 
Stopped in SyntacticAnalyzerChains"; + return TIME_OVERFLOW; + } +#endif + current = next; + } + beginSentence=endSentence; + } + + LINFO << "end syntactic analysis - chains"; + return SUCCESS_ID; +} + + +void SyntacticAnalyzerChains::identifyChains(SyntacticData* data, + const LinguisticGraphVertex& start, + const LinguisticGraphVertex& stop, + uint64_t& startChainId, +#ifdef ANTINNO_SPECIFIC + StopAnalyze const& stopAnalyze +#endif + ) const +{ +// SACLOGINIT; +// LDEBUG << "Searching chains from/to (morph): " << start << "/" << stop; + if (start == stop) + return; + VertexChainIdPropertyMap vertexChainIdMap = get( vertex_chain_id, *(data->graph()) ); + std::set< std::string > alreadyReported; + LinguisticGraphVertex first = data->iterator()-> firstVertex(); + LinguisticGraphVertex last = data->iterator()-> lastVertex(); + VertexDataPropertyMap dataMap = get(vertex_data, (*data->iterator()->getGraph() ) ); +// VertexTokenPropertyMap tokenMap =get(vertex_token, (*data->iterator()->getGraph() ) ); + + std::vector< ChainStackTuple > pile; + // std::stack< LinguisticGraphVertex > pileSons; + Common::MediaticData::ChainsType currentType = Common::MediaticData::NO_CHAIN_TYPE; +// std::stack< std::pair< std::deque< ChainStackTuple >, std::stack< LinguisticGraphVertex > > > tank; + std::vector< std::vector< ChainStackTuple > > tank; + std::set< LinguisticGraphVertex > alreadyFinished; + std::vector nextVxs; +// LDEBUG << "Initializing nextVxs with " << start; + nextVxs.push_back(start); + + + while (! ( tank.empty() && nextVxs.empty()) ) + { +#ifdef ANTINNO_SPECIFIC + if (stopAnalyze) + { + SACLOGINIT; + LERROR << "Analyze too long. 
Stopped in SyntacticAnalyzerChains"; + return; + } +#endif + // LDEBUG << "LOOP"; + if (pile.size() >= m_maxChainLength) + { +#ifdef DEBUG_LP + SACLOGINIT; + LNOTICE << "Chain reached its max size or is too long."; +#endif + +// LDEBUG << "Trying to find a chain end in the too long stack"; + LinguisticGraphVertex lastChainVx = unstackUptoChainEnd(data, pile, currentType); + if (lastChainVx != first) { +// LDEBUG << "Chain end is " << lastChainVx << ". Reporting the chain in the graph."; + std::string newChainString = stringChain(data,pile, currentType, alreadyFinished,startChainId,lastChainVx); + alreadyReported.insert(newChainString); + reportChainInGraph(data,pile, currentType, alreadyFinished,startChainId,lastChainVx); + LinguisticGraphOutEdgeIt it, it_end; + boost::tie(it, it_end) = out_edges(lastChainVx, *(data->graph())); +// LDEBUG << "Initializing for the sons of " << lastChainVx; + for (; it != it_end; it++) + { +#ifdef ANTINNO_SPECIFIC + if (stopAnalyze) + { +#ifdef DEBUG_LP + SACLOGINIT + LERROR << "Analyze too long. Stopped in SyntacticAnalyzerChains"; +#endif + return; + } +#endif +// LDEBUG << "Looking at an out edge of the chain's last vertex : " << *it; + LinguisticGraphVertex nextVx = target(*it, *(data->graph())); + if (alreadyFinished.find(nextVx) == alreadyFinished.end()) + { +// LDEBUG << "Adding " << nextVx << " to nextVxs"; + nextVxs.push_back(nextVx); + } + } + } + else { +// LDEBUG << "NoChainEndInStack"; + } + if ( ! tank.empty() ) + { +// LDEBUG << "Using a new stack after chain too long"; +// boost::tie(pile, pileSons) = tank.back(); + pile = tank.back(); + tank.pop_back(); + } + } + else if (tank.empty()) + { +// LDEBUG << "tank is empty"; + LinguisticGraphVertex nextVx = nextVxs.back(); + nextVxs.pop_back(); + while (alreadyFinished.find(nextVx) != alreadyFinished.end()) + { +#ifdef ANTINNO_SPECIFIC + if (stopAnalyze) + { + SACLOGINIT; + LERROR << "Analyze too long. 
Stopped in SyntacticAnalyzerChains"; + return; + } +#endif + if (nextVxs.empty()) + { +// LDEBUG << "Nothing more to work on: returning"; + return; + } +// LDEBUG << "Ignoring next vertex " << nextVx << " because it is already finished."; + nextVx = nextVxs.back(); + nextVxs.pop_back(); + while ((vertexChainIdMap[nextVx].size() >= m_maxChainsNbByVertex) ) + { + SACLOGINIT; + LNOTICE << "Vertex ignored (" << nextVx << ") because there is too much chains on it."; +// LDEBUG << "Ignoring next vertex " << nextVx << " because there is too much chains on it."; + if (nextVxs.empty()) + { +// LDEBUG << "Nothing more to work on: returning"; + return; + } + nextVx = nextVxs.back(); + nextVxs.pop_back(); + } + } +// LDEBUG << "next vertex is " << nextVx; + bool canFinish = false; + pile.clear(); +// pileSons = std::stack< LinguisticGraphVertex >(); + if ( (nextVx != first) && (nextVx != last) && + ( data->matrices()->canNominalChainBeginBy(dataMap[nextVx]) ) ) + { +// LDEBUG << "next vertex is a nominal chain beginning"; + canFinish = (data->matrices()-> canNominalChainEndBy(dataMap[nextVx])); + pile.push_back(boost::make_tuple(nextVx, canFinish, std::vector< LinguisticGraphVertex >())); + currentType = NOMINAL; + } + else if ( (nextVx != first) && (nextVx != last) && + ( data->matrices()-> canVerbalChainBeginBy(dataMap[nextVx]) ) ) + { +// LDEBUG << "next vertex is a verbal chain beginning"; + canFinish = ( data->matrices()-> canVerbalChainEndBy(dataMap[nextVx])); + pile.push_back(boost::make_tuple(nextVx, canFinish, std::vector< LinguisticGraphVertex >())); + currentType = VERBAL; + } + else + { +// LDEBUG << "next vertex " << nextVx << " is not a chain beginning"; + currentType = NO_CHAIN_TYPE; +// LDEBUG << "Adding nextVx " << nextVx << " to alreadyFinished"; +// alreadyFinished.insert(nextVx); + } + + if (nextVx != stop) + { + std::vector< LinguisticGraphVertex > sons; + LinguisticGraphOutEdgeIt it, it_end; + boost::tie(it, it_end) = out_edges(nextVx, *(data->graph())); 
+ for (; it != it_end; it++) + { +#ifdef ANTINNO_SPECIFIC + if (stopAnalyze) + { + SACLOGINIT; + LERROR << "Analyze too long. Stopped in SyntacticAnalyzerChains"; + return; + } +#endif +// LDEBUG << "Looking at the next vertex out edge: " << *it; + LinguisticGraphVertex nextNext = target(*it, *(data->graph())); + if (nextNext != last) + { + if ( ( alreadyFinished.find(nextNext) == alreadyFinished.end()) && (currentType != NO_CHAIN_TYPE) ) + { +// LDEBUG << "Adding " << nextNext << " to sons of " << nextVx; + sons.push_back(nextNext); + } + else + { +// LDEBUG << "Adding " << nextNext << " to nextVxs"; + nextVxs.push_back(nextNext); + // The addition of the line below seems to solve a loop problem + // whithout producing regressions in TVA tests. + alreadyFinished.insert(nextVx); + } + } + } + if (!sons.empty() && !pile.empty()) + { +// LDEBUG << nextVx << " has sons: pushing them to the tank"; +// tank.push_back(std::make_pair(pile, sons)); + pile.back().get<2>() = sons; + tank.push_back(pile); + } + } + } + else + { + LinguisticGraphVertex father = pile.back().get<0>(); + LinguisticGraphVertex currentSon = pile.back().get<2>().back(); +// LDEBUG << "Father and current son are: " << father << " / " << currentSon; + pile.back().get<2>().pop_back(); + if ( (currentType == NO_CHAIN_TYPE) && (pile.empty()) ) + { + if ( data->matrices()->canNominalChainBeginBy(dataMap[currentSon])) + currentType = NOMINAL; + else if ( data->matrices()->canVerbalChainBeginBy(dataMap[currentSon])) + currentType = VERBAL; + } + + if ( currentType != NO_CHAIN_TYPE ) + { +// LDEBUG << "Current type is " << currentType; + // -------------> + // endroit ou mettre le bloc deplace + // <------------- + if ( (currentSon != last) && + ( data->matrices()-> belongsToMatrix( + dataMap[father], + dataMap[currentSon], + currentType ) ) ) + { +// LDEBUG << father << " -> " << currentSon << " is in the matrix"; + bool canFinish = ( data->matrices()->canChainEndBy(dataMap[currentSon], currentType)); + // 
bloc ci-dessous a deplacer plus haut pour explorer + // toutes les chaines. Pb: rend le parcours tres tres lourd. + // -------------> + if (!pile.empty() && !pile.back().get<2>().empty()) + { +// LDEBUG << father << " has remaining sons: pushing them to the tank"; +// tank.push_back(std::make_pair(pile, pileSons)); + tank.push_back(pile); + } + // <------------- +// LDEBUG << "Pushing " << currentSon << "(" << canFinish << ")"; + pile.push_back(boost::make_tuple(currentSon, canFinish, std::vector< LinguisticGraphVertex >())); + if (currentSon != stop) + { + std::vector< LinguisticGraphVertex >& sons = pile.back().get<2>(); + LinguisticGraphOutEdgeIt it, it_end; + boost::tie(it, it_end) = out_edges(currentSon, *(data->graph())); + for (; it != it_end; it++) + { +// LDEBUG << "Edge is " << *it; +// LDEBUG << "Adding " << target(*it, *(data->graph())) << " to sons of " << currentSon; + sons.push_back(target(*it, *(data->graph()))); + } + } + else + { +// LDEBUG << "Stop reached"; + if (canFinish) + { +// LDEBUG << "currentSon " << currentSon << " is a possible end. Reporting the chain in the graph."; + std::string newChainString = stringChain(data, pile, currentType, alreadyFinished,startChainId,currentSon); + alreadyReported.insert(newChainString); + reportChainInGraph(data, pile, currentType, alreadyFinished,startChainId, currentSon); + } + else + { +// LDEBUG << "currentSon " << currentSon << " is not a possible end."; +// LDEBUG << "Trying to find a chain end in the stack"; + LinguisticGraphVertex lastChainVx = unstackUptoChainEnd(data, pile, currentType); + if (lastChainVx!=first) { +// LDEBUG << "Chain end is " << lastChainVx << ". 
Reporting the chain in the graph."; + std::string newChainString = stringChain(data, pile, currentType, alreadyFinished,startChainId,lastChainVx); + alreadyReported.insert(newChainString); + reportChainInGraph(data, pile, currentType, alreadyFinished,startChainId,lastChainVx); + LinguisticGraphOutEdgeIt it, it_end; + boost::tie(it, it_end) = out_edges(lastChainVx, *(data->graph())); +// LDEBUG << "Initializing for the sons of " << lastChainVx; + for (; it != it_end; it++) + { +// LDEBUG << "Looking at an out edge of the chain's last vertex : " << *it; + LinguisticGraphVertex nextVx = target(*it, *(data->graph())); + if (alreadyFinished.find(nextVx) == alreadyFinished.end()) + { +// LDEBUG << "Adding " << nextVx << " to nextVxs"; + nextVxs.push_back(nextVx); + } + } + } +// else +// { +// LDEBUG << "NoChainEndInStackException catched"; +// } + } + } + } + else + { +#ifdef ANTINNO_SPECIFIC + if (stopAnalyze) + { + SACLOGINIT; + LERROR << "Analyze too long. Stopped in SyntacticAnalyzerChains"; + return; + } +#endif +// LDEBUG << father << " -> " << currentSon << " NOT in the matrix"; + LinguisticGraphVertex lastChainVx = unstackUptoChainEnd(data, pile, currentType); + if (lastChainVx!=first) + { + std::string newChainString = stringChain(data, pile, currentType, alreadyFinished,startChainId,lastChainVx); + if (alreadyReported.find(newChainString) == alreadyReported.end()) + { +// LDEBUG << "Reporting chain: " << newChainString; + alreadyReported.insert(newChainString); + reportChainInGraph(data, pile, currentType, alreadyFinished,startChainId,lastChainVx); + LinguisticGraphOutEdgeIt it, it_end; + boost::tie(it, it_end) = out_edges(lastChainVx, *(data->graph())); +// LDEBUG << "Initializing for the sons of " << lastChainVx << " after unstacking"; + for (; it != it_end; it++) + { +#ifdef ANTINNO_SPECIFIC + if (stopAnalyze) + { + SACLOGINIT; + LERROR << "Analyze too long. 
Stopped in SyntacticAnalyzerChains"; + return; + } +#endif +// LDEBUG << "Looking at an out edge of the chain's last vertex : " << *it; + LinguisticGraphVertex nextVx = target(*it, *(data->graph())); + if (alreadyFinished.find(nextVx) == alreadyFinished.end()) + { +// LDEBUG << "Adding " << nextVx << " to nextVxs"; + nextVxs.push_back(nextVx); + } + } + } +// else +// { +// LDEBUG << "This chain (" << newChainString << ") has already been found. Nothing to do."; +// } + } + else + { +// LDEBUG << "No end of chain found in pile"; + if (alreadyFinished.find(currentSon) == alreadyFinished.end()) + { + if ( parentsFinished(data, father, alreadyFinished ) ) + { +// LDEBUG << "Adding father " << father << " to alreadyFinished"; + alreadyFinished.insert(father); + } + if (currentSon != last) + { +// LDEBUG << "Adding " << currentSon << " to nextVxs"; + nextVxs.push_back(currentSon); + } + else + { +// LDEBUG << "Adding current son " << currentSon << " to alreadyFinished"; + alreadyFinished.insert(currentSon); + } + } + } + } + } + + if ( (pile.empty() || pile.back().get<2>().empty()) && (! 
tank.empty()) ) + { +// LDEBUG << "Using a new stack"; +// boost::tie(pile, pileSons) = tank.back(); + pile = tank.back(); + tank.pop_back(); + } + } + } +// LDEBUG << "<========= chains search finished"; +} + +void SyntacticAnalyzerChains::reportChainInGraph( + SyntacticData* data, + const std::vector< ChainStackTuple >& pile, + Common::MediaticData::ChainsType type, + std::set< LinguisticGraphVertex >& alreadyFinished, + uint64_t& chainId, + const LinguisticGraphVertex& stop) const +{ +// SACLOGINIT; +// LDEBUG << "SyntacticAnalyzerChains::reportChainInGraph"; + + ChainIdStruct property = ChainIdStruct(type, chainId); + + VertexChainIdPropertyMap vertexChainIdMap = get( vertex_chain_id, *(data->graph()) ); + + std::vector< ChainStackTuple >::const_iterator it, it_end; + it = pile.begin(); it_end = pile.end(); + for (; it != it_end; it++) + { + LinguisticGraphVertex current = (*it).get<0>(); + if ((vertexChainIdMap[current].size() >= m_maxChainsNbByVertex) ) + { + SACLOGINIT; + LNOTICE << "Too much chains on " << current << " ; cannot add a new one."; + return; + } + } + + + std::vector< ChainStackTuple >::const_iterator it_beg, it_last; + it = pile.begin(); it_beg = pile.begin(); + it_end = pile.end(); it_last = --(pile.end()); + std::ostringstream oss; + for (; it != it_end; it++) + { + LinguisticGraphVertex current = (*it).get<0>(); + if (it == it_beg) + { + if (it_beg == it_last) + property = ChainIdStruct(type, chainId, LinguisticAnalysisStructure::UNIGRAM); + else + property = ChainIdStruct(type, chainId, LinguisticAnalysisStructure::BEGIN); + } + else if (it == it_last) + { + property = ChainIdStruct(type, chainId, LinguisticAnalysisStructure::END); + } + else + { + property = ChainIdStruct(type, chainId, LinguisticAnalysisStructure::PART); + } + oss << current; + if (current != data->iterator()->firstVertex() && current != data->iterator()->lastVertex() + && (vertexChainIdMap[current].size() < m_maxChainsNbByVertex) ) + { +// LDEBUG << "executing: 
vertexChainIdMap[" << current << "].insert(" << property << ")"; + vertexChainIdMap[current].insert(property); + + if (pile.size() > 1) + { + std::vector< ChainStackTuple >::const_iterator it2, it2_end; + it2 = pile.begin(); it2_end = pile.end(); + bool ok = false; + for (; it2 != it2_end; it2++) + { + LinguisticGraphVertex other = (*it2).get<0>(); + if (other != current) + { + LinguisticGraphEdge e; bool found; + boost::tie (e, found) = edge(current, other, *(data->graph())); + if (found) + { + ok = true; + break; + } + else + { + boost::tie(e, found) = edge(other, current, *(data->graph())); + if (found) + { + ok = true; + break; + } + } + } + } + if (!ok) + { + SACLOGINIT; + LWARN << "An edge should exist for " << current << " !"; + } + } + } + else if (vertexChainIdMap[current].size() >= m_maxChainsNbByVertex) + { + SACLOGINIT; + LNOTICE << "Too much chains on " << current << " ; cannot add a new one."; + } + if (current == stop) + break; + else + oss << " "; + if (current != data->iterator()->firstVertex() && current != data->iterator()->lastVertex() + && (vertexChainIdMap[current].size() < m_maxChainsNbByVertex) ) + if (parentsFinished(data, current, alreadyFinished)) + { +/* LDEBUG << "Parents of " << current << " are finished ; so it too."; + alreadyFinished.insert(current);*/ + } + } +// LDEBUG << "Chain " << chainId << " is : " << (type==NOMINAL?"nominal":"verbal") << " " << oss.str(); + chainId++; + } + +bool SyntacticAnalyzerChains::parentsFinished( + const SyntacticData* data, + const LinguisticGraphVertex& v, + const std::set< LinguisticGraphVertex >& alreadyFinished) const +{ +/* + Critical function : comment logging messages +*/ +// SACLOGINIT; +// LDEBUG << "SyntacticAnalyzerChains::parentsFinished"; + + LinguisticGraphInEdgeIt it, it_end; + boost::tie(it, it_end) = in_edges(v, *(data->graph())); + for (; it != it_end; it++) + { + if (alreadyFinished.find(source(*it, *(data->graph()))) == alreadyFinished.end()) + return false; + } + return true; +} 
+ +std::string SyntacticAnalyzerChains::stringChain( + const SyntacticData* data, + const std::vector< ChainStackTuple >& pile, + Common::MediaticData::ChainsType type, + std::set< LinguisticGraphVertex >& alreadyFinished, + uint64_t chainId, + const LinguisticGraphVertex& stop) const +{ +/* + Critical Function : comment logging messages +*/ +// SACLOGINIT; + ChainIdStruct property = ChainIdStruct(type, chainId); + + std::vector< ChainStackTuple >::const_iterator it, it_beg, it_end, it_last; + it = pile.begin(); it_beg = pile.begin(); + it_end = pile.end(); it_last = --(pile.end()); + std::ostringstream oss; + for (; it != it_end; it++) + { + if (it == it_beg) + { + if (it_beg == it_last) + property = ChainIdStruct(type, chainId, LinguisticAnalysisStructure::UNIGRAM); + else + property = ChainIdStruct(type, chainId, LinguisticAnalysisStructure::BEGIN); + } + else if (it == it_last) + { + property = ChainIdStruct(type, chainId, LinguisticAnalysisStructure::END); + } + else + { + property = ChainIdStruct(type, chainId, LinguisticAnalysisStructure::PART); + } + oss << (*it).get<0>(); + LinguisticGraphVertex current = (*it).get<0>(); + if (current == stop) + break; + else + oss << " "; + if (current != data->iterator()->firstVertex() && current != data->iterator()->lastVertex()) + { + if (pile.size() > 1) + { + std::vector< ChainStackTuple >::const_iterator it2, it2_end; + // @todo replace by lookup only previous and next vertex in pile + it2 = pile.begin(); it2_end = pile.end(); + bool ok = false; + for (; it2 != it2_end; it2++) + { + LinguisticGraphVertex other = (*it2).get<0>(); + if (other != current) + { + LinguisticGraphEdge e; bool found; + boost::tie (e, found) = edge(current, other, *(data->graph())); + if (found) + { + ok = true; + break; + } + else + { + boost::tie (e, found) = edge(other, current, *(data->graph())); + if (found) + { + ok = true; + break; + } + } + } + } + if (!ok) + { + SALOGINIT; + LWARN << "An edge should exist for " << current << " !"; + 
} + } + } + if ( parentsFinished(data, current, alreadyFinished) ) + { +// LDEBUG << "Adding current " << current << " to alreadyFinished"; + alreadyFinished.insert(current); + } + } +// LDEBUG << "In stringChain, chain " << chainId << " is : " << (type==NOMINAL?"nominal":"verbal") << " " << oss.str(); + return oss.str(); +} + +LinguisticGraphVertex SyntacticAnalyzerChains::unstackUptoChainEnd( + const SyntacticData* data, + std::vector< ChainStackTuple >& pile, + Common::MediaticData::ChainsType type + ) const +{ +/* + Critical function : commeng logging messages +*/ +// SACLOGINIT; +// LDEBUG << "unstackUptoChainEnd " << (type==NOMINAL?"nominal":(type==VERBAL?"verbal":"none")); + CVertexDataPropertyMap dataMap = get( vertex_data, (*data->iterator()->getGraph()) ); + + std::vector< ChainStackTuple >::const_reverse_iterator rit, rit_end; + rit = pile.rbegin(); rit_end = pile.rend(); + for (; rit != rit_end; rit++) + { + if ( data->matrices()->canChainEndBy(dataMap[(*rit).get<0>()], type)) + break; +// LDEBUG << "chain cannot finish by " << (*rit).get<0>(); + } + + if (rit != rit_end) + { + LinguisticGraphVertex newChainEnd = (*rit).get<0>(); +// LDEBUG << "Chain end found in pile: " << newChainEnd; + return (newChainEnd); + } + else + { +// LDEBUG << "No chain end found in pile !"; + return data->iterator()->firstVertex(); + } +} + +} // closing namespace SyntacticAnalysis +} // closing namespace LinguisticProcessing +} // closing namespace Lima diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntacticAnalyzer-chains.h b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntacticAnalyzer-chains.h index c5edf04de..8b7e5d76f 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntacticAnalyzer-chains.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntacticAnalyzer-chains.h @@ -118,7 +118,12 @@ class LIMA_SYNTACTICANALYSIS_EXPORT 
SyntacticAnalyzerChains : public MediaProces void identifyChains(SyntacticData* data, const LinguisticGraphVertex& s, const LinguisticGraphVertex& t, +#ifdef ANTINNO_SPECIFIC + uint64_t& startChainId, StopAnalyze const& stopAnalyze = defaultStopAnalyze) const; +#else uint64_t& startChainId) const; +#endif + /** diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntacticAnalyzer-deps.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntacticAnalyzer-deps.cpp index 8664bc0a9..92b3da6eb 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntacticAnalyzer-deps.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntacticAnalyzer-deps.cpp @@ -115,6 +115,10 @@ LimaStatusCode SyntacticAnalyzerDeps::process( SAPLOGINIT; LINFO << "start syntactic analysis - dependence relations search"; +#ifdef ANTINNO_SPECIFIC + auto const& stopAnalyze = analysis.stopAnalyze(); +#endif + AnalysisGraph* anagraph=static_cast(analysis.getData("PosGraph")); if (anagraph==0) { @@ -155,9 +159,9 @@ LimaStatusCode SyntacticAnalyzerDeps::process( { LinguisticGraphVertex beginSentence=boundItr->getFirstVertex(); LinguisticGraphVertex endSentence=boundItr->getLastVertex(); -// LDEBUG << "analyze sentence from vertex " << beginSentence -// << " to vertex " << endSentence; - +#ifdef DEBUG_LP + LDEBUG << "analyze sentence from vertex " << beginSentence << " to vertex " << endSentence; +#endif std::deque< std::string >::const_iterator actionsit, actionsit_end; actionsit = m_actions.begin(); actionsit_end = m_actions.end(); for (; actionsit != actionsit_end; actionsit++) @@ -169,10 +173,14 @@ LimaStatusCode SyntacticAnalyzerDeps::process( } else { -// LDEBUG << "Geting automaton"; +#ifdef DEBUG_LP + LDEBUG << "Geting automaton for action" << action; +#endif Automaton::Recognizer* recognizer = const_cast< Automaton::Recognizer* >((*(m_recognizers.find(action))).second); 
std::vector result; -// LDEBUG << "Applying automaton for action " << action << " on sentence from " << beginSentence << " to " << endSentence; +#ifdef DEBUG_LP + LDEBUG << "Applying automaton for action " << action << " on sentence from " << beginSentence << " to " << endSentence; +#endif try { recognizer->apply(*anagraph, @@ -186,6 +194,13 @@ LimaStatusCode SyntacticAnalyzerDeps::process( false, // return at first success=false m_applySameRuleWhileSuccess // depends on config file ); +#ifdef ANTINNO_SPECIFIC + if (stopAnalyze) + { + LERROR << "Analyze too long. Stopped SyntacticAnalyzerDeps "; + return TIME_OVERFLOW; + } +#endif } catch (const PhoenixGraphHomoDepsVisitor::StartFinishedException& e) {} } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntacticAnalyzer-simplify.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntacticAnalyzer-simplify.cpp index b3cc8b66c..70282282b 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntacticAnalyzer-simplify.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntacticAnalyzer-simplify.cpp @@ -66,6 +66,8 @@ void SyntacticAnalyzerSimplify::init( Manager* manager) { + SASLOGINIT; + LINFO << "SyntacticAnalyzerSimplify::init"; m_language=manager->getInitializationParameters().media; std::string rules=unitConfiguration.getParamsValueAtKey("simplifyAutomaton"); m_recognizer = static_cast(LinguisticResources::single().getResource(m_language,rules)); @@ -76,7 +78,7 @@ LimaStatusCode SyntacticAnalyzerSimplify::process( { Lima::TimeUtilsController timer("SyntacticAnalysis"); SASLOGINIT; - LINFO << "start syntactic analysis - subsentences simplification"; + LINFO << "SyntacticAnalyzerSimplify::process"; AnalysisGraph* anagraph=static_cast(analysis.getData("PosGraph")); if (anagraph==0) diff --git 
a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntagmaticMatrix.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntagmaticMatrix.cpp index 5ee1a0695..983c80596 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntagmaticMatrix.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntagmaticMatrix.cpp @@ -29,6 +29,7 @@ #include "SyntagmaticMatrix.h" #include "XmlSyntagmaticMatrixFileHandler.h" #include "common/AbstractFactoryPattern/SimpleFactory.h" +#include "common/tools/FileUtils.h" #include @@ -77,9 +78,8 @@ void SyntagmDefStruct::init( m_verbalMatrix.language(m_language); try { - std::string resourcePath=Common::MediaticData::MediaticData::single().getResourcesPath(); - std::string matricesFileName=resourcePath + "/" + unitConfiguration.getParamsValueAtKey("file"); - loadFromFile(matricesFileName); + QString matricesFileName = findFileInPaths(Common::MediaticData::MediaticData::single().getResourcesPath().c_str(),unitConfiguration.getParamsValueAtKey("file").c_str()); + loadFromFile(matricesFileName.toUtf8().constData()); } catch (Common::XMLConfigurationFiles::NoSuchParam& ) { diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/TextSegmentation/SegmentationResultsLoader.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/TextSegmentation/SegmentationResultsLoader.cpp index 8cd729c12..4f3127d2c 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/TextSegmentation/SegmentationResultsLoader.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/TextSegmentation/SegmentationResultsLoader.cpp @@ -109,7 +109,11 @@ LimaStatusCode SegmentationResultsLoader::process(AnalysisContent& analysis) con SegmentationResultsLoader::XMLHandler handler(segmData,graph); m_parser->setContentHandler(&handler); m_parser->setErrorHandler(&handler); +#ifdef ANTINNO_SPECIFIC QFile 
file(getInputFile(analysis).c_str()); +#else + QFile file(getInputFile(analysis)); +#endif if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) throw XMLException(); if (!m_parser->parse( QXmlInputSource(&file))) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/TextSegmentation/SentenceBoundariesFinder.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/TextSegmentation/SentenceBoundariesFinder.cpp index 017a060d8..e92f1ce1b 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/TextSegmentation/SentenceBoundariesFinder.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/TextSegmentation/SentenceBoundariesFinder.cpp @@ -128,6 +128,10 @@ LimaStatusCode SentenceBoundariesFinder::process( return MISSING_DATA; } +#ifdef ANTINNO_SPECIFIC + auto const& stopAnalyze = analysis.stopAnalyze(); +#endif + LinguisticGraphVertex lastVx=anagraph->lastVertex(); LinguisticGraphVertex beginSentence=anagraph->firstVertex(); #ifdef DEBUG_LP @@ -140,6 +144,13 @@ LimaStatusCode SentenceBoundariesFinder::process( if (m_boundaryValues.empty()) { while (beginSentence!=lastVx) { +#ifdef ANTINNO_SPECIFIC + if (stopAnalyze) + { + LERROR << "Analyze too long. Stopped in SentenceBoundariesFinder"; + return TIME_OVERFLOW; + } +#endif LinguisticGraphVertex endSentence=anagraph->nextMainPathVertex(beginSentence,*m_microAccessor,m_boundaryMicros,lastVx); #ifdef DEBUG_LP LDEBUG << "found endSentence at " << endSentence; @@ -154,6 +165,13 @@ LimaStatusCode SentenceBoundariesFinder::process( LinguisticGraphVertex endSentence=anagraph->nextMainPathVertex(beginSentence,*m_microAccessor,m_boundaryMicros,lastVx); while (endSentence!=lastVx) { +#ifdef ANTINNO_SPECIFIC + if (stopAnalyze) + { + LERROR << "Analyze too long. 
Stopped in SentenceBoundariesFinder"; + return TIME_OVERFLOW; + } +#endif Token* t=get(vertex_token,*(anagraph->getGraph()),endSentence); #ifdef DEBUG_LP if (t!=0) { @@ -165,6 +183,13 @@ LimaStatusCode SentenceBoundariesFinder::process( } #endif if (t==0 || m_boundaryValues.find(t->stringForm())!=m_boundaryValues.end()) { +#ifdef ANTINNO_SPECIFIC + if (stopAnalyze) + { + LERROR << "Analyze too long. Stopped in SentenceBoundariesFinder"; + return TIME_OVERFLOW; + } +#endif sb->add(Segment("sentence",beginSentence,endSentence,anagraph)); beginSentence=endSentence; } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/WordSenseAnalysis/Test.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/WordSenseAnalysis/Test.cpp index b7059aa12..38d893011 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/WordSenseAnalysis/Test.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/WordSenseAnalysis/Test.cpp @@ -23,7 +23,7 @@ int main(int argc,char* argv[]) std::string lineString; size_t linenum(0); - getline(ifl, lineString); + lineString = Lima::Common::Misc::readLine(ifl); while (ifl.good() && !ifl.eof()) { Common::Misc::chomp(lineString); diff --git a/lima_linguisticprocessing/test/analyzeText.cpp b/lima_linguisticprocessing/test/analyzeText.cpp index afb7772f5..95e7fb58e 100644 --- a/lima_linguisticprocessing/test/analyzeText.cpp +++ b/lima_linguisticprocessing/test/analyzeText.cpp @@ -30,6 +30,7 @@ #include "common/XMLConfigurationFiles/xmlConfigurationFileParser.h" #include "common/Data/strwstrtools.h" #include "common/time/traceUtils.h" +#include "common/tools/FileUtils.h" #include "common/QsLog/QsLog.h" #include "common/QsLog/QsLogDest.h" #include "common/QsLog/QsLogCategories.h" @@ -41,6 +42,7 @@ #include "linguisticProcessing/client/AnalysisHandlers/BowTextWriter.h" #include "linguisticProcessing/client/AnalysisHandlers/BowTextHandler.h" #include 
"linguisticProcessing/client/AnalysisHandlers/SimpleStreamHandler.h" +#include "linguisticProcessing/client/AnalysisHandlers/LTRTextHandler.h" #include "linguisticProcessing/core/EventAnalysis/EventHandler.h" #include "linguisticProcessing/core/LinguisticResources/AbstractResource.h" #include "linguisticProcessing/core/LinguisticResources/LinguisticResources.h" @@ -91,17 +93,22 @@ int main(int argc, char **argv) int run(int argc,char** argv) { - QsLogging::initQsLog(); - // Necessary to initialize factories under Windows + QStringList configDirs = buildConfigurationDirectoriesList(QStringList() << "lima",QStringList()); + QString configPath = configDirs.join(LIMA_PATH_SEPARATOR); + + QStringList resourcesDirs = buildResourcesDirectoriesList(QStringList() << "lima",QStringList()); + QString resourcesPath = resourcesDirs.join(LIMA_PATH_SEPARATOR); + + QsLogging::initQsLog(configPath); + // Necessary to initialize factories Lima::AmosePluginsManager::single(); + Lima::AmosePluginsManager::changeable().loadPlugins(configPath); // std::cerr << "Amose plugins initialized" << std::endl; - std::string resourcesPath; - std::string configDir; + std::string strResourcesPath; std::string lpConfigFile; std::string commonConfigFile; std::string clientId; - std::string cesartOutput; std::vector languages; std::vector dumpersv; std::vector outputsv; @@ -110,6 +117,7 @@ int run(int argc,char** argv) std::vector vinactiveUnits; std::string meta; std::string splitMode; + std::string strConfigPath; po::options_description desc("Usage"); @@ -123,9 +131,9 @@ int run(int argc,char** argv) "where to write dumpers output. By default, each dumper writes its results on a file whose name is the input file with a predefined suffix appended. This option allows to chose another suffix or to write on standard output. 
Its syntax is the following: : with a dumper name and destination, either the value 'stdout' or a suffix.") ("mm-core-client", po::value(&clientId)->default_value("lima-coreclient"), "Set the linguistic processing client to use") - ("resources-dir", po::value(&resourcesPath)->default_value(qgetenv("LIMA_RESOURCES").constData()==0?"":qgetenv("LIMA_RESOURCES").constData(),"$LIMA_RESOURCES"), + ("resources-dir", po::value(&strResourcesPath), "Set the directory containing the LIMA linguistic resources") - ("config-dir", po::value(&configDir)->default_value(qgetenv("LIMA_CONF").constData()==0?"":qgetenv("LIMA_CONF").constData(),"$LIMA_CONF"), + ("config-dir", po::value(&strConfigPath), "Set the directory containing the (LIMA) configuration files") ("common-config-file", po::value(&commonConfigFile)->default_value("lima-common.xml"), "Set the LIMA common libraries configuration file to use") @@ -158,13 +166,15 @@ int run(int argc,char** argv) std::cout << desc << std::endl; return SUCCESS_ID; } - if (resourcesPath.empty()) + if (!strResourcesPath.empty()) { - resourcesPath = "/usr/share/apps/lima/resources/"; + resourcesPath = QString::fromUtf8(strResourcesPath.c_str()); + resourcesDirs = resourcesPath.split(LIMA_PATH_SEPARATOR); } - if (configDir.empty()) + if (!strConfigPath.empty()) { - configDir = "/usr/share/config/lima/"; + configPath = QString::fromUtf8(strConfigPath.c_str()); + configDirs = configPath.split(LIMA_PATH_SEPARATOR); } std::deque langs(languages.size()); std::copy(languages.begin(), languages.end(), langs.begin()); @@ -242,24 +252,36 @@ int run(int argc,char** argv) uint64_t beginTime=TimeUtils::getCurrentTime(); - AbstractLinguisticProcessingClient* client(0); - // initialize common Common::MediaticData::MediaticData::changeable().init( - resourcesPath, - configDir, + resourcesPath.toUtf8().constData(), + configPath.toUtf8().constData(), commonConfigFile, langs); - // initialize linguistic processing - 
Lima::Common::XMLConfigurationFiles::XMLConfigurationFileParser lpconfig(configDir + "/" + lpConfigFile); - LinguisticProcessingClientFactory::changeable().configureClientFactory( - clientId, - lpconfig, - langs, - pipelines); + bool clientFactoryConfigured = false; + Q_FOREACH(QString configDir, configDirs) + { + if (QFileInfo(configDir + "/" + lpConfigFile.c_str()).exists()) + { + // initialize linguistic processing + Lima::Common::XMLConfigurationFiles::XMLConfigurationFileParser lpconfig((configDir + "/" + lpConfigFile.c_str()).toStdString()); + LinguisticProcessingClientFactory::changeable().configureClientFactory( + clientId, + lpconfig, + langs, + pipelines); + clientFactoryConfigured = true; + break; + } + } + if(!clientFactoryConfigured) + { +// std::cerr << "No LinguisticProcessingClientFactory were configured with" << configDirs.join(LIMA_PATH_SEPARATOR).toStdString() << "and" << lpConfigFile << std::endl; + return EXIT_FAILURE; + } - client=static_cast(LinguisticProcessingClientFactory::single().createClient(clientId)); + std::shared_ptr< AbstractLinguisticProcessingClient > client = std::dynamic_pointer_cast(LinguisticProcessingClientFactory::single().createClient(clientId)); // Set the handlers std::map handlers; @@ -268,6 +290,7 @@ int run(int argc,char** argv) BowTextHandler* bowTextHandler = 0; SimpleStreamHandler* simpleStreamHandler = 0; SimpleStreamHandler* fullXmlSimpleStreamHandler = 0; + LTRTextHandler* ltrTextHandler=0; if (dumpers.find("event") != dumpers.end()) { @@ -294,6 +317,11 @@ int run(int argc,char** argv) fullXmlSimpleStreamHandler = new SimpleStreamHandler(); handlers.insert(std::make_pair("fullXmlSimpleStreamHandler", fullXmlSimpleStreamHandler)); } + if (dumpers.find("ltr") != dumpers.end()) + { + ltrTextHandler= new LTRTextHandler(); + handlers.insert(std::make_pair("ltrTextHandler", ltrTextHandler)); + } std::map metaData; @@ -342,7 +370,7 @@ int run(int argc,char** argv) if (splitMode == "lines") { int lineNum = 0, nbLines = 
0; - std::cerr << "Counting number of lines…"; + std::cerr << "Counting number of lines"; while (!file.atEnd()) { file.readLine(); @@ -388,22 +416,28 @@ int run(int argc,char** argv) closeHandlerOutputFile(fullxmlofs); } std::cout << std::endl; - delete client; // free handlers if (eventHandler != 0) delete eventHandler; if (bowTextWriter!= 0) delete bowTextWriter; - if (bowTextHandler!= 0) - delete bowTextHandler; if (simpleStreamHandler!= 0) delete simpleStreamHandler; if (fullXmlSimpleStreamHandler!= 0) delete fullXmlSimpleStreamHandler; - delete Common::MediaticData::MediaticData::pchangeable(); - delete LinguisticProcessingClientFactory::pchangeable(); + if (bowTextHandler!= 0) { + // not handled in output file: just print on output (this should just be used for testing) + std::cout << bowTextHandler->getBowText(); + delete bowTextHandler; + } + if (ltrTextHandler!= 0) { + // not handled in output file: just print on output (this should just be used for testing) + std::cout << ltrTextHandler->getLTRText(); + delete ltrTextHandler; + } TIMELOGINIT; LINFO << "Total: " << TimeUtils::diffTime(beginTime,TimeUtils::getCurrentTime()) << " ms"; + TimeUtils::logAllCumulatedTime("et finalement..."); return SUCCESS_ID; } diff --git a/lima_linguisticprocessing/test/analyzetextservercore.cpp b/lima_linguisticprocessing/test/analyzetextservercore.cpp index 9b5f90464..a938ab02a 100644 --- a/lima_linguisticprocessing/test/analyzetextservercore.cpp +++ b/lima_linguisticprocessing/test/analyzetextservercore.cpp @@ -139,10 +139,9 @@ int run(int argc,char** argv) uint64_t beginTime=TimeUtils::getCurrentTime(); - AbstractLinguisticProcessingClient* client(0); - std::map handlers; + std::shared_ptr< AbstractLinguisticProcessingClient > client; try { // initialize common @@ -160,7 +159,7 @@ int run(int argc,char** argv) langs, pipelines); - client=dynamic_cast(LinguisticProcessingClientFactory::single().createClient(clientId)); + client = 
std::dynamic_pointer_cast(LinguisticProcessingClientFactory::single().createClient(clientId)); } catch (InvalidConfiguration& e) { @@ -366,7 +365,6 @@ int run(int argc,char** argv) std::cout << "ERROR: unknown error." << std::endl; } } - delete client; TIMELOGINIT; LINFO << "Total: " << TimeUtils::diffTime(beginTime,TimeUtils::getCurrentTime()) << " ms"; diff --git a/lima_linguisticprocessing/test/limaServer/AnalysisWrapper.cpp b/lima_linguisticprocessing/test/limaServer/AnalysisWrapper.cpp index 1734d4227..f40961d9f 100644 --- a/lima_linguisticprocessing/test/limaServer/AnalysisWrapper.cpp +++ b/lima_linguisticprocessing/test/limaServer/AnalysisWrapper.cpp @@ -37,7 +37,6 @@ #include #include #include -#include #include #include // std::stringstream @@ -53,19 +52,20 @@ class AnalysisWrapperPrivate { friend class AnalysisWrapper; public: - AnalysisWrapperPrivate (Lima::LinguisticProcessing::AbstractLinguisticProcessingClient* analyzer, + AnalysisWrapperPrivate (std::shared_ptr< AbstractLinguisticProcessingClient > analyzer, const std::set& langs); ~AnalysisWrapperPrivate() {} std::ostream* openHandlerOutputFile(AbstractTextualAnalysisHandler* handler, const std::string& fileName, const std::set&dumpers, const std::string& dumperId); void closeHandlerOutputFile(std::ostream* ofs); - boost::shared_ptr< Lima::LinguisticProcessing::AbstractLinguisticProcessingClient > m_analyzer; + std::shared_ptr< Lima::LinguisticProcessing::AbstractLinguisticProcessingClient > m_analyzer; const std::set& m_langs; }; -AnalysisWrapperPrivate::AnalysisWrapperPrivate(Lima::LinguisticProcessing::AbstractLinguisticProcessingClient* analyzer, - const std::set& langs) : +AnalysisWrapperPrivate::AnalysisWrapperPrivate( + std::shared_ptr< AbstractLinguisticProcessingClient > analyzer, + const std::set& langs) : m_analyzer(analyzer), m_langs(langs) { @@ -106,7 +106,7 @@ void AnalysisWrapperPrivate::closeHandlerOutputFile(std::ostream* ofs) } -AnalysisWrapper::AnalysisWrapper 
(Lima::LinguisticProcessing::AbstractLinguisticProcessingClient* analyzer, +AnalysisWrapper::AnalysisWrapper (std::shared_ptr< Lima::LinguisticProcessing::AbstractLinguisticProcessingClient > analyzer, const std::set& langs, QObject* parent ): QObject(parent), m_d(new AnalysisWrapperPrivate(analyzer,langs)) @@ -117,8 +117,6 @@ AnalysisWrapper::AnalysisWrapper (Lima::LinguisticProcessing::AbstractLinguistic AnalysisWrapper::~AnalysisWrapper() { - CORECLIENTLOGINIT; - LDEBUG << "AnalysisWrapper::~AnalysisWrapper"; delete m_d; } diff --git a/lima_linguisticprocessing/test/limaServer/AnalysisWrapper.h b/lima_linguisticprocessing/test/limaServer/AnalysisWrapper.h index 28949f638..9230e9f54 100644 --- a/lima_linguisticprocessing/test/limaServer/AnalysisWrapper.h +++ b/lima_linguisticprocessing/test/limaServer/AnalysisWrapper.h @@ -22,6 +22,7 @@ #define ANALYSISWRAPPER_H #include +#include #include #include @@ -45,8 +46,9 @@ class AnalysisWrapper : public QObject { Q_OBJECT public: - AnalysisWrapper (Lima::LinguisticProcessing::AbstractLinguisticProcessingClient* m_analyzer, - const std::set& langs, QObject* parent = 0 ); + AnalysisWrapper ( + std::shared_ptr< Lima::LinguisticProcessing::AbstractLinguisticProcessingClient > analyzer, + const std::set& langs, QObject* parent = 0 ); virtual ~AnalysisWrapper(); QString analyze(const QString& text, const QString& language, const QString& pipeline); diff --git a/lima_linguisticprocessing/test/limaServer/LimaDBusServer.cpp b/lima_linguisticprocessing/test/limaServer/LimaDBusServer.cpp index d9eb99972..7d6f0fab1 100644 --- a/lima_linguisticprocessing/test/limaServer/LimaDBusServer.cpp +++ b/lima_linguisticprocessing/test/limaServer/LimaDBusServer.cpp @@ -93,7 +93,7 @@ LimaDBusServerPrivate::LimaDBusServerPrivate( const std::string& configDir, pipelines); LDEBUG << "LimaDBusServer::LimaDBusServer: createClient..."; - m_analyzer=new 
AnalysisWrapper(static_cast(LinguisticProcessingClientFactory::single().createClient(clientId)),m_langs,p); + m_analyzer=new AnalysisWrapper(std::dynamic_pointer_cast(LinguisticProcessingClientFactory::single().createClient(clientId)),m_langs,p); } diff --git a/lima_linguisticprocessing/test/limaServer/LimaServer.cpp b/lima_linguisticprocessing/test/limaServer/LimaServer.cpp index af43b4191..97c680e7b 100644 --- a/lima_linguisticprocessing/test/limaServer/LimaServer.cpp +++ b/lima_linguisticprocessing/test/limaServer/LimaServer.cpp @@ -122,7 +122,7 @@ LimaServer::LimaServer( const std::string& configDir, pipelines); LDEBUG << "LimaServer::LimaServer: createClient..."; - m_analyzer=static_cast(LinguisticProcessingClientFactory::single().createClient(clientId)); + m_analyzer = std::dynamic_pointer_cast(LinguisticProcessingClientFactory::single().createClient(clientId)); LDEBUG << "LimaServer::LimaServer: create QHttpServer..."; m_server = new QHttpServer(this); @@ -137,17 +137,6 @@ LimaServer::LimaServer( const std::string& configDir, LimaServer::~LimaServer() { - CORECLIENTLOGINIT; - LINFO << "LimaServer::~LimaServer"; - // free client - LINFO << "LimaServer::~LimaServer: httpserver deleted!"; - delete m_analyzer; - LINFO << "LimaServer::~LimaServer: m_analyzer deleted"; - // free MediaticData ??? 
- delete Common::MediaticData::MediaticData::pchangeable(); - LINFO << "LimaServer::~LimaServer: mediaticData deleted"; - // free linguistic processing ressources - delete LinguisticProcessingClientFactory::pchangeable(); } void LimaServer::quit() { @@ -166,7 +155,7 @@ void LimaServer::handleRequest(QHttpRequest *req, QHttpResponse *resp) CORECLIENTLOGINIT; req->storeBody(); LDEBUG << "LimaServer::handleRequest: create AnalysisThread..."; - AnalysisThread *thread = new AnalysisThread(m_analyzer, req, resp, m_langs, this ); + AnalysisThread *thread = new AnalysisThread(m_analyzer.get(), req, resp, m_langs, this ); connect(req,SIGNAL(end()),thread,SLOT(startAnalysis())); connect(thread, SIGNAL(finished()), thread, SLOT(deleteLater())); thread->start(); diff --git a/lima_linguisticprocessing/test/limaServer/LimaServer.h b/lima_linguisticprocessing/test/limaServer/LimaServer.h index 27575b189..01484c5df 100644 --- a/lima_linguisticprocessing/test/limaServer/LimaServer.h +++ b/lima_linguisticprocessing/test/limaServer/LimaServer.h @@ -47,6 +47,7 @@ #include #include #include +#include class QTimer; @@ -80,7 +81,7 @@ private Q_SLOTS: QTimer* m_timer; - Lima::LinguisticProcessing::AbstractLinguisticProcessingClient* m_analyzer; + std::shared_ptr< Lima::LinguisticProcessing::AbstractLinguisticProcessingClient > m_analyzer; }; #endif diff --git a/lima_linguisticprocessing/test/limaServer/analysisthread.cpp b/lima_linguisticprocessing/test/limaServer/analysisthread.cpp index f1790c6fb..b695242a7 100644 --- a/lima_linguisticprocessing/test/limaServer/analysisthread.cpp +++ b/lima_linguisticprocessing/test/limaServer/analysisthread.cpp @@ -86,8 +86,6 @@ AnalysisThread::AnalysisThread (Lima::LinguisticProcessing::AbstractLinguisticPr AnalysisThread::~AnalysisThread() { - CORECLIENTLOGINIT; - LDEBUG << "AnalysisThread::~AnalysisThread"; delete m_d; } diff --git a/lima_linguisticprocessing/test/limaServer/main.cpp b/lima_linguisticprocessing/test/limaServer/main.cpp index 
1bbe076a6..4bed11d8b 100644 --- a/lima_linguisticprocessing/test/limaServer/main.cpp +++ b/lima_linguisticprocessing/test/limaServer/main.cpp @@ -25,6 +25,7 @@ #include #include "common/LimaCommon.h" +#include "common/tools/FileUtils.h" #include "common/QsLog/QsLogCategories.h" #include "common/XMLConfigurationFiles/xmlConfigurationFileParser.h" #include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" @@ -43,9 +44,13 @@ namespace po = boost::program_options; int main(int argc, char **argv) { + QStringList configDirs = Misc::buildConfigurationDirectoriesList(QStringList() << "lima",QStringList()); + QString configPath = configDirs.join(LIMA_PATH_SEPARATOR); + QCoreApplication app(argc, argv); - QsLogging::initQsLog(); + QsLogging::initQsLog(configPath); Lima::AmosePluginsManager::single(); + Lima::AmosePluginsManager::changeable().loadPlugins(configPath); std::cerr << "Amose plugins initialized" << std::endl; QsLogging::initQsLog(); diff --git a/lima_linguisticprocessing/test/srl.cpp b/lima_linguisticprocessing/test/srl.cpp index 65bcb2709..a00097fd1 100644 --- a/lima_linguisticprocessing/test/srl.cpp +++ b/lima_linguisticprocessing/test/srl.cpp @@ -54,8 +54,11 @@ std::string text = "1 The the DET DT _ _ 2 NMOD _ _\n" break; } } +#ifndef WIN32 Py_SetProgramName(const_cast( str_program_name.toStdWString().c_str())); - +#else + Py_SetProgramName( (wchar_t*)str_program_name.unicode() ); +#endif Py_Initialize(); diff --git a/lima_linguisticprocessing/tools/applyAutomaton/apply-rules.cpp b/lima_linguisticprocessing/tools/applyAutomaton/apply-rules.cpp index f7e019dd3..10091cde9 100644 --- a/lima_linguisticprocessing/tools/applyAutomaton/apply-rules.cpp +++ b/lima_linguisticprocessing/tools/applyAutomaton/apply-rules.cpp @@ -225,12 +225,12 @@ void readCommandLineArguments(uint64_t argc, char *argv[]) } } if (param.resourcesPath.empty()) { - char* resourcesStr = getenv("LIMA_RESOURCES"); + const char* resourcesStr = qgetenv("LIMA_RESOURCES").constData(); if 
(resourcesStr != NULL) { param.resourcesPath = resourcesStr; } else { cerr << "$LIMA_RESOURCES not defined" << endl; exit(1); } } if (param.configDir.empty()) { - char* configStr = getenv("LIMA_CONF"); + const char* configStr = qgetenv("LIMA_CONF").constData(); if (configStr != NULL) { param.configDir = configStr; } else { cerr << "$LIMA_CONF not defined" << endl; exit(1); } } @@ -240,8 +240,7 @@ void readCommandLineArguments(uint64_t argc, char *argv[]) // local getline void localGetline(ifstream& file, LimaString& line) { - string str; - getline(file,str); + string str = Lima::Common::Misc::readLine(file); if (param.encoding=="latin1") { line = Misc::latin15stdstring2limastring(str); } diff --git a/lima_linguisticprocessing/tools/automatonCompiler/compile-rules.cpp b/lima_linguisticprocessing/tools/automatonCompiler/compile-rules.cpp index 719562300..41cff255e 100644 --- a/lima_linguisticprocessing/tools/automatonCompiler/compile-rules.cpp +++ b/lima_linguisticprocessing/tools/automatonCompiler/compile-rules.cpp @@ -1,540 +1,615 @@ -/* - Copyright 2002-2013 CEA LIST - - This file is part of LIMA. - - LIMA is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - LIMA is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with LIMA. 
If not, see -*/ -/****************************************************************************** -* -* File : compile-rules.cpp -* Author : Besancon Romaric (besanconr@zoe.cea.fr) -* Created on : Fri Oct 25 2002 -* Copyright : (c) 2002 by CEA -* Version : $Id$ -* -******************************************************************************/ - - -#ifdef HAVE_CONFIG_H -#include -#endif - -#include "compilerExceptions.h" -#include "libautomatonCompiler/recognizerCompiler.h" - -#include "linguisticProcessing/core/LinguisticResources/AbstractResource.h" -#include "linguisticProcessing/core/LinguisticResources/LinguisticResources.h" -#include "linguisticProcessing/client/LinguisticProcessingClientFactory.h" -#include "linguisticProcessing/client/AnalysisHandlers/BowTextWriter.h" -#include "linguisticProcessing/client/AnalysisHandlers/SimpleStreamHandler.h" -// #include "common/AbstractFactoryPattern/MainFactory.h" -#include "common/tools/LimaMainTaskRunner.h" -#include "common/MediaProcessors/MediaProcessUnit.h" -#include "common/MediaProcessors/MediaAnalysisDumper.h" -#include "common/AbstractFactoryPattern/AmosePluginsManager.h" -#include "common/time/timeUtilsController.h" - -#include "linguisticProcessing/core/Automaton/recognizer.h" -#include "linguisticProcessing/core/Automaton/automatonReaderWriter.h" -#include "linguisticProcessing/core/Automaton/automatonCommon.h" // for exceptions -#include "common/LimaCommon.h" -#include "common/MediaticData/mediaticData.h" -#include -#include -#include -#include -#include - -#include - - -using namespace std; - -using namespace Lima; -using namespace Lima::Common::XMLConfigurationFiles; -using namespace Lima::LinguisticProcessing::Automaton; -using namespace Lima::LinguisticProcessing; -using namespace Lima::Common::MediaticData; -using namespace Lima::Common::Misc; - -//**************************************************************************** -// declarations 
-//**************************************************************************** -// help mode & usage -static const string USAGE("usage : compile-rules [-h] -ooutputfile rulesfile\n"); - -static const string HELP("A compiler for the rules of the Named Entities recognizer\n" - +USAGE - +"\n" -+"-h : this help page\n" -+"--output=file : name of the output file for the compiled rules\n" -// +"(or -ofile)\n" -+"\n" -+"--language=... : specify the language of the recognizer\n" -+"--modex=... : specify the name of the modex config file\n" -+"--pipeline=... : specify the name of the pipeline for the modex\n" -+"--configDir=... : specify the directory to find the config files (default is $LIMA_CONF)\n" -+"--resourcesDir=... : specify the directory to find the resources (default is $LIMA_RESOURCES)\n" -+"--common-config-file=... : = Optional. Default is lima-common.xml\n" -+"--lp-config-file=... : = Optional. Default is lima-analysis.xml\n" -+"--encoding=... : specify the encoding of the rules file\n" -+"--useDictionary : uses a dictionary to reorganize rules\n" -+"--debug : compiles in debug mode\n" -+"\n" -+"--listTriggers : list the triggers with the corresponding offest\n" -+"--bin (or -r) : read a binary file containing compiled rules : if \n" -+" the --listTriggers is not set, print the rules on stdout\n" -+"\n" -+"rulesfile is the name of the file containing the rules in plain text\n"); - -//**************************************************************************** -#define DEFAULT_COMMON_CONFIG "lima-common.xml" -#define DEFAULT_LP_CONFIG "lima-analysis.xml" -#define DEFAULT_ENCODING "utf8" - -//**************************************************************************** -// GLOBAL variable -> the command line arguments -struct Param -{ - string inputRulesFile; // name of the rules file - string outputFile; // name of the output file for the compiled rules - string resourcesDir; // directory for resources - string configDir; // directory for config files - string 
commonConfigFile; // config file for linguisticData - string lpConfigFile; // config file for linguistic processing - string modexConfigFile; // config file for modex - string pipeline; // pipeline for modex (defined in config file) - string language; // language of the files - string encoding; // default encoding of rules files - bool decompile; // reads compiled rules - bool listTriggers; // list the triggers with their associated index - bool useDictionary; // use a dictionary to reorganize rules - bool loadPossibleTypes;// force loading of possible types - bool debug; // compile in debug mode (store rule ids for debug purposes) - bool help; // help mode -} -param={"", - "", - "", - "", - DEFAULT_COMMON_CONFIG, - DEFAULT_LP_CONFIG, - "", - "", - "", - DEFAULT_ENCODING, - false, - false, - false, - false, - false, - false}; - -void readCommandLineArguments(uint64_t argc, char *argv[]) -{ -// bool languageSpecified(false); - for(uint64_t i(1); i= argc) - { - std::cerr << "no output filename given" << endl; - cerr << USAGE << endl; - exit(1); - } - else - { - param.outputFile = argv[i]; - } - } - } - else if (s.find("--output=",0)==0) - { - param.outputFile=s.substr(9,s.length()-9); - } - else if (s.find("--modex=",0)==0) - { - param.modexConfigFile=string(s,8); - } - else if (s.find("--pipeline=",0)==0) - { - param.pipeline=string(s,11); - } - else if (s.find("--debug",0)==0) - { - param.debug=true; - } - else if (s[0]=='-') - { - std::cerr << "unrecognized option " << s << endl; - cerr << USAGE << endl; - exit(1); - } - else - { // file names - param.inputRulesFile=s; - } - } - // if not specified, search default values in environment variables - if (param.resourcesDir.empty()) - { - char* resourcesStr = getenv("LIMA_RESOURCES"); - if (resourcesStr != NULL) - { - param.resourcesDir = resourcesStr; - } - else - { - param.resourcesDir = "/usr/share/apps/lima/resources/"; - } - } - if (param.configDir.empty()) - { - char* configStr = getenv("LIMA_CONF"); - if 
(configStr != NULL) - { - param.configDir = configStr; - } - else - { - param.configDir = "/usr/share/config/lima/"; - } - } - - //ensure all needed parameters are set - if (param.language.empty()) { - cerr << "Error: missing --language=.. argument " << endl; - exit(1); - } -// if (param.modexConfigFile.empty()) { -// cerr << "Error: missing --modex=.. argument " << endl; -// exit(1); -// } - -} - -std::vector getDynamicLibraryNames(XMLConfigurationFileParser& parser, const std::string& pipeline); - -//**************************************************************************** -// M A I N -//**************************************************************************** -#include "common/tools/LimaMainTaskRunner.h" -#include "common/AbstractFactoryPattern/AmosePluginsManager.h" -#include - -int run(int aargc,char** aargv); - -int main(int argc, char **argv) -{ - QCoreApplication a(argc, argv); - - // Task parented to the application so that it - // will be deleted by the application. - Lima::LimaMainTaskRunner* task = new Lima::LimaMainTaskRunner(argc, argv, run, &a); - - // This will cause the application to exit when - // the task signals finished. - QObject::connect(task, SIGNAL(finished(int)), &a, SLOT(quit())); - - // This will run the task from the application event loop. 
- QTimer::singleShot(0, task, SLOT(run())); - - return a.exec(); - -} - - -int run(int argc,char** argv) -{ - QsLogging::initQsLog(); - //Lima::TimeUtilsController("run", true); - // Necessary to initialize factories - Lima::AmosePluginsManager::single(); - - readCommandLineArguments(argc,argv); - - deque langs; - langs.push_back(param.language); - - // initialize linguisticData -// try - { - // initialize common - LOGINIT("Automaton::Compiler"); - LDEBUG << "main: MediaticData::changeable().init( " << param.resourcesDir << ")..."; - MediaticData::changeable().init( - param.resourcesDir, - param.configDir, - param.commonConfigFile, - langs); - LDEBUG << "main: MediaticData::changeable().init( " << param.resourcesDir << ") done!"; - - /* - * @TODO eviter l'initialisation des ressources dans compiles rules - * On est oblige d'initialiser les ressources, juste pour recuperer un - * Recognizer vide. Il faut pouvoir creer un Recognizer dans avoir a - * initialiser les ressources - */ - - // initialize linguistic processing resources - MediaId language = MediaticData::single().media(param.language); - - XMLConfigurationFileParser lpconfig(param.configDir + "/" + param.lpConfigFile); - const string& langConfigFile=lpconfig.getModuleGroupParamValue("lima-coreclient","mediaProcessingDefinitionFiles",param.language); - XMLConfigurationFileParser langParser(param.configDir + "/" + langConfigFile); - ModuleConfigurationStructure& module=langParser.getModuleConfiguration("Resources"); - LinguisticResources::changeable().initLanguage( - language, - module, - false); // don't load mainkeys in stringpool, no use - - AbstractResource* resReco = LinguisticResources::single().getResource(language,"automatonCompiler"); - - Recognizer& reco = *(static_cast< Recognizer* >(resReco)); - - // look at the modex config file to find the dynamic libraries that must be loaded - if (! 
param.modexConfigFile.empty()) { - LOGINIT("Automaton::Compiler"); - LDEBUG << "use modex file " << param.modexConfigFile; - XMLConfigurationFileParser modexconfig(param.configDir + "/" + param.modexConfigFile); - vector libraries=getDynamicLibraryNames(modexconfig,param.pipeline); - for (vector::const_iterator it=libraries.begin(),it_end=libraries.end();it!=it_end; it++) - { - LOGINIT("Automaton::Compiler"); - LDEBUG << "load library " << *it; - Common::DynamicLibrariesManager::changeable().loadLibrary(*it); - } - } - - //Recognizer reco; - // if the rules file is in binary format and we want to print its content - if (param.decompile) - { - try - { - //reco.readFromFile(param.inputRulesFile); - AutomatonReader reader; - reader.readRecognizer(param.inputRulesFile,reco); - - if (! param.listTriggers) - { - cout << reco; - } - } - catch (exception& e) - { - std::cerr << "Error while reading rules file: " << e.what() << endl; - exit(1); - } - } - else - { - // read the rules file in text format - //try - { - // Lima::TimeUtilsController *ctrl2 = new Lima::TimeUtilsController("read file and build recognizer", true); - // Lima::TimeUtilsController("read file and build recognizer", true); - std::cerr << "\rBuilding recognizer…"; - RecognizerCompiler::setRecognizerEncoding(param.encoding); - RecognizerCompiler compiler(param.inputRulesFile); - compiler.buildRecognizer(reco,language); - // delete ctrl2; - } - /*catch (exception& e) - { - std::cerr << "recognizer construction failed:"<< e.what() << endl; - exit(1); - }*/ - - // if we want to use a dictionary to reorganize rules - if (param.useDictionary) - { - // Lima::TimeUtilsController("useDictionary", true); - try - { - - string dicostr = "mainDictionary"; - AbstractResource* res= LinguisticResources::single().getResource(language,dicostr); - - AnalysisDict::AbstractAnalysisDictionary* dico = static_cast< AnalysisDict::AbstractAnalysisDictionary* >(res); - if (dico==0) - { - throw runtime_error("dictionary not 
available for language "+ - param.language); - } - // Reorganization not available - // reco.reorganizeRules(*dico); - } - // when character is searched out of text buffer - catch (std::exception& e) { - std::cerr << "Error: " << e.what() << endl; - } - } - - // write recognizer to file - try - { - if (! param.outputFile.empty()) - { - std::cerr << "\rWriting recognizer…"; - AutomatonWriter writer; - writer.writeRecognizer(reco,param.outputFile,language,param.debug); - //reco.writeToFile(param.outputFile); - } - } - catch (Lima::LinguisticProcessing::Automaton::OpenFileException& e) - { - std::cerr << "OpenFileException: " << e.what() << endl; exit(1); - } - } - - if (param.listTriggers) - { - reco.listTriggers(); - } - - } -// catch (InvalidConfiguration& e) -// { -// std::cerr << "Caught InvalidConfiguration: " << e.what() << std::endl; -// throw e; -// } -// catch (NoSuchModule &) -// { -// std::cerr << e.what() << std::endl; -// } -// catch (NoSuchGroup& e) -// { -// std::cerr << e.what() << std::endl; -// } -// catch (NoSuchParam& ) -// { -// std::cerr << e.what() << std::endl; -// } - TIMELOGINIT; - TimeUtils::logAllCumulatedTime("And at last"); - - - return EXIT_SUCCESS; -} - -//----------------------------------------------------------------------------------------------- -//----------------------------------------------------------------------------------------------- -void addLibs(GroupConfigurationStructure& group, - std::vector& libNames) -{ - try { - std::string libs=group.getAttribute("lib"); - std::string::size_type begin=0; - std::string::size_type i=libs.find(",",begin); - while (i!=std::string::npos) { - libNames.push_back(string(libs,begin,i-begin)); - begin=i+1; - i=libs.find(",",begin); - } - libNames.push_back(string(libs,begin)); - } - catch (NoSuchAttribute& ) {} // do nothing: optional -} - -std::vector getDynamicLibraryNames(XMLConfigurationFileParser& parser, - const std::string& pipeline) -{ - vector libNames; - try { - 
ModuleConfigurationStructure& module=parser.getModuleConfiguration("Processors"); - - if (! pipeline.empty()) { - // search libs for given pipeline - try { - GroupConfigurationStructure group=module.getGroupNamed(pipeline); - addLibs(group,libNames); - // do it for all groups included in pipeline - deque& processUnits=group.getListsValueAtKey("processUnitSequence"); - for (deque::const_iterator it=processUnits.begin(),it_end=processUnits.end(); it!=it_end; it++) - { - try { - GroupConfigurationStructure pu=module.getGroupNamed(*it); - addLibs(pu,libNames); - // @todo: should be recursive - } - catch (NoSuchGroup) {} // missing group for processUnit in pipeline : ignored - } - return libNames; - } - catch (NoSuchGroup) { - cerr << "Warning: config file for modex has no group '" << pipeline << "' in 'Processors' : ignored" << endl; - } - catch (NoSuchList) {} // no processUnitSequence list : ignored - } - - // if no pipeline specified, go through all groups - for (ModuleConfigurationStructure::iterator it=module.begin(), - it_end=module.end(); it!=it_end; it++) - { - // ModuleConfigurationStructure is a map - addLibs((*it).second,libNames); - } - } - catch (NoSuchModule &) { - cerr << "Error: config file for modex has no module 'Processors'" << endl; - } - - return libNames; -} +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. 
If not, see +*/ +/****************************************************************************** +* +* File : compile-rules.cpp +* Author : Besancon Romaric (besanconr@zoe.cea.fr) +* Created on : Fri Oct 25 2002 +* Copyright : (c) 2002 by CEA +* Version : $Id$ +* +******************************************************************************/ + + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include "compilerExceptions.h" +#include "libautomatonCompiler/recognizerCompiler.h" + +#include "linguisticProcessing/core/LinguisticResources/AbstractResource.h" +#include "linguisticProcessing/core/LinguisticResources/LinguisticResources.h" +#include "linguisticProcessing/client/LinguisticProcessingClientFactory.h" +#include "linguisticProcessing/client/AnalysisHandlers/BowTextWriter.h" +#include "linguisticProcessing/client/AnalysisHandlers/SimpleStreamHandler.h" +// #include "common/AbstractFactoryPattern/MainFactory.h" +#include "common/tools/LimaMainTaskRunner.h" +#include "common/MediaProcessors/MediaProcessUnit.h" +#include "common/MediaProcessors/MediaAnalysisDumper.h" +#include "common/AbstractFactoryPattern/AmosePluginsManager.h" +#include "common/time/timeUtilsController.h" +#include "common/tools/FileUtils.h" + +#include "linguisticProcessing/core/Automaton/recognizer.h" +#include "linguisticProcessing/core/Automaton/automatonReaderWriter.h" +#include "linguisticProcessing/core/Automaton/automatonCommon.h" // for exceptions +#include "common/LimaCommon.h" +#include "common/MediaticData/mediaticData.h" +#include +#include +#include +#include +#include + +#include + + +using namespace std; + +using namespace Lima; +using namespace Lima::Common::XMLConfigurationFiles; +using namespace Lima::LinguisticProcessing::Automaton; +using namespace Lima::LinguisticProcessing; +using namespace Lima::Common::MediaticData; +using namespace Lima::Common::Misc; + +//**************************************************************************** +// declarations 
+//**************************************************************************** +// help mode & usage +static const string USAGE("usage : compile-rules [-h] -ooutputfile rulesfile\n"); + +static const string HELP("A compiler for the rules of the Named Entities recognizer\n" + +USAGE + +"\n" ++"-h : this help page\n" ++"--output=file : name of the output file for the compiled rules\n" +// +"(or -ofile)\n" ++"\n" ++"--language=... : specify the language of the recognizer\n" ++"--modex=... : specify the name of the modex config file\n" ++"--pipeline=... : specify the name of the pipeline for the modex\n" ++"--configDir=... : specify the directory to find the config files (default is $LIMA_CONF)\n" ++"--resourcesDir=... : specify the directory to find the resources (default is $LIMA_RESOURCES)\n" ++"--common-config-file=... : = Optional. Default is lima-common.xml\n" ++"--lp-config-file=... : = Optional. Default is lima-analysis.xml\n" ++"--encoding=... : specify the encoding of the rules file\n" ++"--useDictionary : uses a dictionary to reorganize rules\n" ++"--debug : compiles in debug mode\n" ++"\n" ++"--listTriggers : list the triggers with the corresponding offest\n" ++"--bin (or -r) : read a binary file containing compiled rules : if \n" ++" the --listTriggers is not set, print the rules on stdout\n" ++"\n" ++"rulesfile is the name of the file containing the rules in plain text\n"); + +//**************************************************************************** +#define DEFAULT_COMMON_CONFIG "lima-common.xml" +#define DEFAULT_LP_CONFIG "lima-analysis.xml" +#define DEFAULT_ENCODING "utf8" + +//**************************************************************************** +// GLOBAL variable -> the command line arguments +struct Param +{ + string inputRulesFile; // name of the rules file + string outputFile; // name of the output file for the compiled rules + string resourcesDir; // directory for resources + string configDir; // directory for config files + string 
commonConfigFile; // config file for linguisticData + string lpConfigFile; // config file for linguistic processing + string modexConfigFile; // config file for modex + string pipeline; // pipeline for modex (defined in config file) + string language; // language of the files + string encoding; // default encoding of rules files + bool decompile; // reads compiled rules + bool listTriggers; // list the triggers with their associated index + bool useDictionary; // use a dictionary to reorganize rules + bool loadPossibleTypes;// force loading of possible types + bool debug; // compile in debug mode (store rule ids for debug purposes) + bool help; // help mode +} +param={"", + "", + "", + "", + DEFAULT_COMMON_CONFIG, + DEFAULT_LP_CONFIG, + "", + "", + "", + DEFAULT_ENCODING, + false, + false, + false, + false, + false, + false}; + +void readCommandLineArguments(uint64_t argc, char *argv[]) +{ +// bool languageSpecified(false); + for(uint64_t i(1); i= argc) + { + std::cerr << "no output filename given" << endl; + cerr << USAGE << endl; + exit(1); + } + else + { + param.outputFile = argv[i]; + } + } + } + else if (s.find("--output=",0)==0) + { + param.outputFile=s.substr(9,s.length()-9); + } + else if (s.find("--modex=",0)==0) + { + param.modexConfigFile=string(s,8); + } + else if (s.find("--pipeline=",0)==0) + { + param.pipeline=string(s,11); + } + else if (s.find("--debug",0)==0) + { + param.debug=true; + } + else if (s[0]=='-') + { + std::cerr << "unrecognized option " << s << endl; + cerr << USAGE << endl; + exit(1); + } + else + { // file names + param.inputRulesFile=s; + } + } + + //ensure all needed parameters are set + if (param.language.empty()) { + cerr << "Error: missing --language=.. 
argument " << endl; + exit(1); + } + +} + +std::vector getDynamicLibraryNames(XMLConfigurationFileParser& parser, const std::string& pipeline); + +//**************************************************************************** +// M A I N +//**************************************************************************** +#include "common/tools/LimaMainTaskRunner.h" +#ifdef ANTINNO_SPECIFIC +#include "common/AbstractFactoryPattern/antinno.LibraryLoader.class.h" +#else +#include "common/AbstractFactoryPattern/AmosePluginsManager.h" +#endif +#include + +int run(int aargc,char** aargv); + +int main(int argc, char **argv) +{ + QCoreApplication a(argc, argv); + + // Task parented to the application so that it + // will be deleted by the application. + Lima::LimaMainTaskRunner* task = new Lima::LimaMainTaskRunner(argc, argv, run, &a); + + // This will cause the application to exit when + // the task signals finished. + QObject::connect(task, SIGNAL(finished(int)), &a, SLOT(quit())); + + // This will run the task from the application event loop. 
+ QTimer::singleShot(0, task, SLOT(run())); + + return a.exec(); + +} + + +int run(int argc,char** argv) +{ + readCommandLineArguments(argc,argv); + + QStringList configDirs = buildConfigurationDirectoriesList(QStringList() << "lima",QStringList()); + QString configPath = configDirs.join(LIMA_PATH_SEPARATOR); + if (!param.configDir.empty()) + { + configPath = QString::fromUtf8(param.configDir.c_str()); + configDirs = configPath.split(LIMA_PATH_SEPARATOR); + } + + QStringList resourcesDirs = buildResourcesDirectoriesList(QStringList() << "lima",QStringList()); + QString resourcesPath = resourcesDirs.join(LIMA_PATH_SEPARATOR); + + if (!param.resourcesDir.empty()) + { + resourcesPath = QString::fromUtf8(param.resourcesDir.c_str()); + resourcesDirs = resourcesPath.split(LIMA_PATH_SEPARATOR); + } +#ifdef ANTINNO_SPECIFIC + + + + + { + std::string configDir; + + if (param.configDir.empty()) + { + if ((::std::getenv("AMOSE_CONF")) == NULL) + { + std::cerr << "No environment variable \"AMOSE_CONF\" set or variable is empty" << std::endl; + return EXIT_FAILURE; + } + else + { + configDir = ::std::getenv("AMOSE_CONF"); + } + } + else + { + configDir = param.configDir; + } + + try + { + ::std::string const file = configDir + "/plugins.txt"; + Lima::antinno::LibraryLoader().loadFromFile(file); + } + catch (::std::exception const& ex) + { + std::cerr << "Exception during plugins loading. 
" << ex.what() << std::endl; + return EXIT_FAILURE; + } + + ::std::string const log4cppFilePath = configDir + "/log4cpp.properties"; + ::boost::shared_ptr pLog1(new QsLogging::antinno::Log4cpp()); + pLog1->configure(log4cppFilePath); + //QsLogging::antinno::log = pLog1; + QsLogging::antinno::log = pLog1; + if (!QsLogging::Categories::instance().configure(log4cppFilePath.data())) + { + std::cerr << "Configure Problem " << log4cppFilePath << std::endl; + return EXIT_FAILURE; + } + + ::std::cout << "Plugins initialized" << ::std::endl; + } +#else + QsLogging::initQsLog(configPath); + // Necessary to initialize factories + Lima::AmosePluginsManager::single(); + Lima::AmosePluginsManager::changeable().loadPlugins(configPath); +#endif + + + + deque langs; + langs.push_back(param.language); + + // initialize linguisticData +// try + { + // initialize common + LOGINIT("Automaton::Compiler"); + LDEBUG << "main: MediaticData::changeable().init( " << param.resourcesDir << ")..."; + MediaticData::changeable().init( + resourcesPath.toUtf8().constData(), + configPath.toUtf8().constData(), + param.commonConfigFile, + langs); + LDEBUG << "main: MediaticData::changeable().init( " << param.resourcesDir << ") done!"; + + /* + * @TODO eviter l'initialisation des ressources dans compiles rules + * On est oblige d'initialiser les ressources, juste pour recuperer un + * Recognizer vide. 
Il faut pouvoir creer un Recognizer dans avoir a + * initialiser les ressources + */ + + // initialize linguistic processing resources + MediaId language = MediaticData::single().media(param.language); + + bool languageInitialized = false; + Q_FOREACH(QString configDir, configDirs) + { + if (QFileInfo(configDir + "/" + param.lpConfigFile.c_str()).exists()) + { + XMLConfigurationFileParser lpconfig((configDir + "/" + param.lpConfigFile.c_str()).toUtf8().constData()); + const string& langConfigFile=lpconfig.getModuleGroupParamValue("lima-coreclient","mediaProcessingDefinitionFiles",param.language); + XMLConfigurationFileParser langParser((configDir + "/" + langConfigFile.c_str()).toUtf8().constData()); + ModuleConfigurationStructure& module=langParser.getModuleConfiguration("Resources"); + LinguisticResources::changeable().initLanguage( + language, + module, + false); // don't load mainkeys in stringpool, no use + languageInitialized = true; + } + } + if(!languageInitialized) + { + LOGINIT("Automaton::Compiler"); + LERROR << "No language was configured configured with" << configDirs + << "and" << param.lpConfigFile.c_str(); + return EXIT_FAILURE; + } + + AbstractResource* resReco = LinguisticResources::single().getResource(language,"automatonCompiler"); + + Recognizer& reco = *(static_cast< Recognizer* >(resReco)); + + // look at the modex config file to find the dynamic libraries that must be loaded + if (! 
param.modexConfigFile.empty()) { + LOGINIT("Automaton::Compiler"); + LDEBUG << "use modex file " << param.modexConfigFile; + bool modexInitialized = false; + Q_FOREACH(QString configDir, configDirs) + { + if (QFileInfo(configDir + "/" + param.modexConfigFile.c_str()).exists()) + { + XMLConfigurationFileParser modexconfig((configDir + "/" + param.modexConfigFile.c_str()).toUtf8().constData()); + vector libraries=getDynamicLibraryNames(modexconfig,param.pipeline); + for (vector::const_iterator it=libraries.begin(),it_end=libraries.end();it!=it_end; it++) + { + LOGINIT("Automaton::Compiler"); + LDEBUG << "load library " << *it; + Common::DynamicLibrariesManager::changeable().loadLibrary(*it); + } + modexInitialized = true; + } + } + if(!modexInitialized) + { + LOGINIT("Automaton::Compiler"); + LERROR << "No modex plugin was loaded with" << configDirs + << "and" << param.modexConfigFile.c_str(); + return EXIT_FAILURE; + } + } + //Recognizer reco; + // if the rules file is in binary format and we want to print its content + if (param.decompile) + { + try + { + //reco.readFromFile(param.inputRulesFile); + AutomatonReader reader; + reader.readRecognizer(param.inputRulesFile,reco); + + if (! 
param.listTriggers) + { + cout << reco; + } + } + catch (exception& e) + { + std::cerr << "Error while reading rules file: " << e.what() << endl; + exit(1); + } + } + else + { + // read the rules file in text format + //try + { + // Lima::TimeUtilsController *ctrl2 = new Lima::TimeUtilsController("read file and build recognizer", true); + // Lima::TimeUtilsController("read file and build recognizer", true); + std::cerr << "\rBuilding recognizer"; + RecognizerCompiler::setRecognizerEncoding(param.encoding); + RecognizerCompiler compiler(param.inputRulesFile); + compiler.buildRecognizer(reco,language); + // delete ctrl2; + } + /*catch (exception& e) + { + std::cerr << "recognizer construction failed:"<< e.what() << endl; + exit(1); + }*/ + + // if we want to use a dictionary to reorganize rules + if (param.useDictionary) + { + // Lima::TimeUtilsController("useDictionary", true); + try + { + + string dicostr = "mainDictionary"; + AbstractResource* res= LinguisticResources::single().getResource(language,dicostr); + + AnalysisDict::AbstractAnalysisDictionary* dico = static_cast< AnalysisDict::AbstractAnalysisDictionary* >(res); + if (dico==0) + { + throw runtime_error("dictionary not available for language "+ + param.language); + } + // Reorganization not available + // reco.reorganizeRules(*dico); + } + // when character is searched out of text buffer + catch (std::exception& e) { + std::cerr << "Error: " << e.what() << endl; + } + } + + // write recognizer to file + try + { + if (! 
param.outputFile.empty()) + { + std::cerr << "\rWriting recognizer"; + AutomatonWriter writer; + LINFO << "writer.WritingRecognizer(language:" << language << "debug:" << param.debug << ")"; + writer.writeRecognizer(reco,param.outputFile,language,param.debug); + //reco.writeToFile(param.outputFile); + } + } + catch (Lima::LinguisticProcessing::Automaton::OpenFileException& e) + { + std::cerr << "OpenFileException: " << e.what() << endl; exit(1); + } + } + + if (param.listTriggers) + { + reco.listTriggers(); + } + + } +// catch (InvalidConfiguration& e) +// { +// std::cerr << "Caught InvalidConfiguration: " << e.what() << std::endl; +// throw e; +// } +// catch (NoSuchModule &) +// { +// std::cerr << e.what() << std::endl; +// } +// catch (NoSuchGroup& e) +// { +// std::cerr << e.what() << std::endl; +// } +// catch (NoSuchParam& ) +// { +// std::cerr << e.what() << std::endl; +// } + TIMELOGINIT; + TimeUtils::logAllCumulatedTime("And at last"); + + + return EXIT_SUCCESS; +} + +//----------------------------------------------------------------------------------------------- +//----------------------------------------------------------------------------------------------- +void addLibs(GroupConfigurationStructure& group, + std::vector& libNames) +{ + try { + std::string libs=group.getAttribute("lib"); + std::string::size_type begin=0; + std::string::size_type i=libs.find(",",begin); + while (i!=std::string::npos) { + libNames.push_back(string(libs,begin,i-begin)); + begin=i+1; + i=libs.find(",",begin); + } + libNames.push_back(string(libs,begin)); + } + catch (NoSuchAttribute& ) {} // do nothing: optional +} + +std::vector getDynamicLibraryNames(XMLConfigurationFileParser& parser, + const std::string& pipeline) +{ + vector libNames; + try { + ModuleConfigurationStructure& module=parser.getModuleConfiguration("Processors"); + + if (! 
pipeline.empty()) { + // search libs for given pipeline + try { + GroupConfigurationStructure group=module.getGroupNamed(pipeline); + addLibs(group,libNames); + // do it for all groups included in pipeline + deque& processUnits=group.getListsValueAtKey("processUnitSequence"); + for (deque::const_iterator it=processUnits.begin(),it_end=processUnits.end(); it!=it_end; it++) + { + try { + GroupConfigurationStructure pu=module.getGroupNamed(*it); + addLibs(pu,libNames); + // @todo: should be recursive + } + catch (NoSuchGroup) {} // missing group for processUnit in pipeline : ignored + } + return libNames; + } + catch (NoSuchGroup) { + cerr << "Warning: config file for modex has no group '" << pipeline << "' in 'Processors' : ignored" << endl; + } + catch (NoSuchList) {} // no processUnitSequence list : ignored + } + + // if no pipeline specified, go through all groups + for (ModuleConfigurationStructure::iterator it=module.begin(), + it_end=module.end(); it!=it_end; it++) + { + // ModuleConfigurationStructure is a map + addLibs((*it).second,libNames); + } + } + catch (NoSuchModule &) { + cerr << "Error: config file for modex has no module 'Processors'" << endl; + } + + return libNames; +} diff --git a/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/automatonCompiler.cpp b/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/automatonCompiler.cpp index e0bd6f31c..26970e184 100644 --- a/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/automatonCompiler.cpp +++ b/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/automatonCompiler.cpp @@ -27,6 +27,7 @@ ************************************************************************/ #include "automatonCompiler.h" +#include "gazeteer.h" #include "transitionCompiler.h" #include "compilerExceptions.h" #include "linguisticProcessing/core/Automaton/automatonCommon.h" @@ -49,6 +50,7 @@ namespace AutomatonCompiler { 
/***********************************************************************/ Automaton buildAutomaton(const AutomatonString& automatonString, MediaId language, + const std::vector& gazeteers, SearchGraphSense sense, const std::vector& activeEntityGroups) { AUCLOGINIT; @@ -65,7 +67,7 @@ Automaton buildAutomaton(const AutomatonString& automatonString, else { // LDEBUG << "automatonString is: " << automatonString; Tstate finalState=buildAutomaton(a,automatonString, - initialState,currentId,language, + initialState,currentId,language,gazeteers, activeEntityGroups); // Lima::TimeUtilsController* ctrlAF = new Lima::TimeUtilsController("make final", true); // LDEBUG << "final state is " << finalState; @@ -111,6 +113,7 @@ Tstate buildAutomaton(Automaton& a, const AutomatonString& automatonString, const Tstate& initialState, const std::string& currentId, MediaId language, + const std::vector& gazeteers, const std::vector& activeEntityGroups) { #ifdef DEBUG_LP @@ -133,7 +136,7 @@ Tstate buildAutomaton(Automaton& a, // TODO: check if we have to handle modifiers of numbering like first, next and last in currentId while (min > 0) { // must be there x times -> insert it as non-optional - finalState = buildAutomatonNotOptional(a,automatonString,finalState,currentId,language,activeEntityGroups); + finalState = buildAutomatonNotOptional(a,automatonString,finalState,currentId,language,gazeteers,activeEntityGroups); min--; if (max != AutomatonString::INFINITE_OCC) { max--; @@ -148,12 +151,12 @@ Tstate buildAutomaton(Automaton& a, // add the epsilon-transition from first to last (for minOcurrences=0) // and insert again the automaton from last to first // (to avoid epsilon-cycles) - finalState = buildAutomatonNotOptional(a,automatonString,finalState,currentId,language,activeEntityGroups); + finalState = buildAutomatonNotOptional(a,automatonString,finalState,currentId,language,gazeteers,activeEntityGroups); a.addTransition(optInitialState,finalState,new EpsilonTransition()); Tstate 
tmpFinalState(finalState); Tstate tmpReturnState = buildAutomatonNotOptional(a,automatonString, - tmpFinalState,currentId,language,activeEntityGroups); + tmpFinalState,currentId,language,gazeteers,activeEntityGroups); //a.addTransition(tmpReturnState,optInitialState,new EpsilonTransition()); a.addTransition(tmpReturnState,finalState,new EpsilonTransition()); @@ -168,7 +171,7 @@ Tstate buildAutomaton(Automaton& a, // insert it as non-optional as many times as necessary // and add the epsilon-transition while (max > 0) { - finalState = buildAutomatonNotOptional(a,automatonString,finalState,currentId,language,activeEntityGroups); + finalState = buildAutomatonNotOptional(a,automatonString,finalState,currentId,language,gazeteers,activeEntityGroups); a.addTransition(optInitialState,finalState,new EpsilonTransition()); max--; } @@ -176,7 +179,7 @@ Tstate buildAutomaton(Automaton& a, return finalState; } else { - return buildAutomatonNotOptional(a,automatonString,initialState,currentId,language,activeEntityGroups); + return buildAutomatonNotOptional(a,automatonString,initialState,currentId,language,gazeteers,activeEntityGroups); } } @@ -184,6 +187,7 @@ Tstate buildAutomatonNotOptional(Automaton& a, const AutomatonString& automatonString, const Tstate& initialState, const std::string& initialId, MediaId language, + const std::vector& gazeteers, const std::vector& activeEntityGroups) { #ifdef DEBUG_LP @@ -204,7 +208,7 @@ Tstate buildAutomatonNotOptional(Automaton& a, it=automatonString.getParts().begin(), it_end=automatonString.getParts().end(); for (; it!=it_end; it++) { - Tstate altFinalState=buildAutomaton(a,*it,initialState,currentId,language,activeEntityGroups); + Tstate altFinalState=buildAutomaton(a,*it,initialState,currentId,language,gazeteers,activeEntityGroups); a.addTransition(altFinalState,finalState,new EpsilonTransition()); // id??? 
} return finalState; @@ -224,7 +228,7 @@ Tstate buildAutomatonNotOptional(Automaton& a, for (; it!=it_end; it++, subCount++) { std::string currentId(initialId); currentId.append(".").append(std::to_string(static_cast(subCount))); - seqfinalState=buildAutomaton(a,*it,seqInitialState,currentId,language,activeEntityGroups); + seqfinalState=buildAutomaton(a,*it,seqInitialState,currentId,language,gazeteers,activeEntityGroups); seqInitialState=seqfinalState; } return seqfinalState; @@ -234,7 +238,11 @@ Tstate buildAutomatonNotOptional(Automaton& a, #ifdef DEBUG_LP LDEBUG << "is unit "; #endif - TransitionUnit *t = createTransition(automatonString,language,initialId,activeEntityGroups); + TransitionUnit* t; +#ifdef DEBUG_LP + LDEBUG << "buildAutomatonNotOptional: createSimpleTransition from " << automatonString.getString(); +#endif + t = createTransition(automatonString,language,initialId,activeEntityGroups); if (t != 0) { Tstate finalState = a.addState(); a.addTransition(initialState, finalState, t); @@ -244,6 +252,48 @@ Tstate buildAutomatonNotOptional(Automaton& a, throw AutomatonErrorException("attempt to insert empty transition\n"); } } + // We do not yet know how to use gazetteer with any element defined with a category or with space chrecter + else if (automatonString.isSimpleGazeteer()) { +#ifdef DEBUG_LP + LDEBUG << "is simpleGazeteer "; +#endif + const LimaString& unitString = automatonString.getUnitString(); + const LimaString& gazeteerName = unitString.mid(1,unitString.size()-1); +// OME LimaString gazeteerName = automatonString.getUnitString().mid(1,automatonString.getString().size()-1); +// OME int i; +// OME for (i=0; i gazeteerAsVectorOfString = gazeteer; +// OME #ifdef DEBUG_LP +// OME LDEBUG << "buildAutomatonNotOptional: new GazeteerTransition from " << gazeteer.alias(); +// OME #endif + // t = createGazeteerTransition(automatonString,language,initialId,activeEntityGroups,gazeteerAsVectorOfString,true); + // DONE?: replace new GazeteerTransition by 
createTransition.... + // t = new GazeteerTransition(gazeteerAsVectorOfString,gazeteer.alias(),true); + // TransitionUnit* trigger = new GazeteerTransition(gazeteerAsVectorOfString,gazeteerName,keepTrigger); */ + // TODO, vrifier que + // - grer les "constraints" + TransitionUnit* t = createGazeteerTransition(gazeteerName, + language, initialId, activeEntityGroups, + gazeteers,automatonString.isKept(),false); + + if (t != 0) { + const std::vector& constraints = automatonString.getConstraints(); + for (std::size_t i(0); iaddConstraint(constraints[i]); + } + Tstate finalState = a.addState(); + a.addTransition(initialState, finalState, t); + return finalState; + } + else { + throw AutomatonErrorException("attempt to insert empty transition\n"); + } + } return initialState; } diff --git a/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/automatonCompiler.h b/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/automatonCompiler.h index c934cd27b..dec576eed 100644 --- a/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/automatonCompiler.h +++ b/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/automatonCompiler.h @@ -46,6 +46,7 @@ namespace AutomatonCompiler { // use directly automaton string LIMA_AUTOMATONCOMPILER_EXPORT Automaton buildAutomaton(const AutomatonString& automatonString, MediaId language, + const std::vector& gazeteers, SearchGraphSense sense, const std::vector& activeEntityGroups); @@ -53,12 +54,14 @@ namespace AutomatonCompiler { const AutomatonString& automatonString, const Tstate& initialState, const std::string& currentId, MediaId language, + const std::vector& gazeteers, const std::vector& activeEntityGroups); LIMA_AUTOMATONCOMPILER_EXPORT Tstate buildAutomatonNotOptional(Automaton& a, const AutomatonString& automatonString, const Tstate& initialState, const std::string& currentId, MediaId language, + const std::vector& gazeteers, const std::vector& 
activeEntityGroups); } // end namespace } // end namespace diff --git a/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/automatonString.cpp b/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/automatonString.cpp index 96ede9264..d4b4e5319 100644 --- a/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/automatonString.cpp +++ b/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/automatonString.cpp @@ -717,6 +717,10 @@ void AutomatonString::removeArtificialSequences(const bool inSubPart) { // LDEBUG << "trying to remove : " << (*part).getString(); part=m_parts.erase(part); // LDEBUG << "has erased artificial : " << getString(); +#ifdef ANTINNO_BUGFIX + // spcique lib std microsoft + if (part != m_parts.begin()) +#endif part--; // erase returns iterator following the one erased } // else if (inSubPart) { @@ -968,10 +972,26 @@ void AutomatonString::parseUnit(const LimaString& str, oss << "unknown class " << Common::Misc::limastring2utf8stdstring(str.mid(newBegin+1,newSize-1)); throw AutomatonCompilerException(oss.str()); } - // copy only type, parts and unit (other are set by modifiers) - setType((*it).getAutomatonString().getType()); - m_parts=(*it).getAutomatonString().getParts(); - m_unit=(*it).getAutomatonString().getUnitString(); + const Gazeteer& gazeteer = *it; + //if( !gazeteer.hasMultiTermWord() && gazeteer.hasNoCategoryNorTstatus() ) { + if( gazeteer.hasNoCategoryNorTstatus() ) { +#ifdef DEBUG_LP + LDEBUG << "AutomatonString: set type(SIMPLE_GAZETEER)"; +#endif + setType(SIMPLE_GAZETEER); + // m_parts is empty!; + // m_unit=gazeteer.getName(); + m_unit=str.mid(newBegin,newSize); + } + else { + // copy only type, parts and unit (other are set by modifiers) +#ifdef DEBUG_LP + LDEBUG << "AutomatonString: set type(" << (*it).getAutomatonString().getType() << ")"; +#endif + setType((*it).getAutomatonString().getType()); + m_parts=(*it).getAutomatonString().getParts(); + 
m_unit=(*it).getAutomatonString().getUnitString(); + } } else if (str[newBegin] == CHAR_BEGIN_NAMESUB) { #ifdef DEBUG_LP @@ -1062,6 +1082,9 @@ LimaString AutomatonString::getString() const { case UNIT: { return applyModifiers(m_unit); } + case SIMPLE_GAZETEER: { + return applyModifiers(m_unit); + } case SEQUENCE: { LimaString str; if (m_parts.size()) { diff --git a/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/automatonString.h b/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/automatonString.h index da50f2d46..87e45d67e 100644 --- a/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/automatonString.h +++ b/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/automatonString.h @@ -54,6 +54,7 @@ typedef enum { UNKNOWN_TYPE, UNIT, SEQUENCE, + SIMPLE_GAZETEER, ALTERNATIVE } ElementType; @@ -125,6 +126,7 @@ class AutomatonString bool isUnit() const; bool isSequence() const; bool isAlternative() const; + bool isSimpleGazeteer() const; bool isArtificialSequence() const; //only for construction bool isSplittedFirst() const { return m_isSplittedFirst; } @@ -346,6 +348,9 @@ inline bool AutomatonString::isArtificialSequence() const { inline bool AutomatonString::isUnit() const { return (m_type == UNIT); } +inline bool AutomatonString::isSimpleGazeteer() const { + return (m_type == SIMPLE_GAZETEER); +} inline bool AutomatonString::isSequence() const { return (m_type == SEQUENCE); } diff --git a/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/gazeteer.cpp b/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/gazeteer.cpp index 53ba484d0..50c4251de 100644 --- a/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/gazeteer.cpp +++ b/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/gazeteer.cpp @@ -51,13 +51,18 @@ namespace Automaton { Gazeteer::Gazeteer(): std::vector(0), m_alias(), -m_automatonString() { 
+m_automatonString(), +m_hasMultiTermWord(false), +m_hasNoCategoryNorTstatus(true) +{ } Gazeteer::Gazeteer(const Gazeteer& g): std::vector(g), m_alias(g.m_alias), -m_automatonString(g.m_automatonString) +m_automatonString(g.m_automatonString), +m_hasMultiTermWord(g.m_hasMultiTermWord), +m_hasNoCategoryNorTstatus(g.m_hasNoCategoryNorTstatus) { } @@ -75,6 +80,8 @@ Gazeteer& Gazeteer::operator = (const Gazeteer& g) { std::vector::operator=(g); m_alias = g.alias(); m_automatonString=g.m_automatonString; + m_hasMultiTermWord=g.m_hasMultiTermWord; + m_hasNoCategoryNorTstatus=g.m_hasNoCategoryNorTstatus; } return (*this); } @@ -88,6 +95,23 @@ Gazeteer& Gazeteer::add(const Gazeteer& g) { } /***********************************************************************/ +/***********************************************************************/ +// add a word in the inherited std::vector +// check if word is simple word (no category, no Tstatus) +/***********************************************************************/ +void Gazeteer::addWord(const LimaString& s) { + if( (s.startsWith(STRING_TSTATUS_TR)) + || (s.startsWith(STRING_TSTATUS_TR_small)) + || (s.contains(CHAR_POS_TR)) ) + { + resetCategoryOrTstatusFlag(); + } + if( s.contains(CHAR_SEP_RE) ) { + setHasMultiTermWordFlag(); + } + push_back(s); +} + /***********************************************************************/ // build the automatonString corresponding to the gazeteer diff --git a/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/gazeteer.h b/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/gazeteer.h index 8702e2229..54c478454 100644 --- a/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/gazeteer.h +++ b/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/gazeteer.h @@ -72,8 +72,15 @@ class Gazeteer : public std::vector LimaString readName(RecognizerCompiler& reco); void readValues(RecognizerCompiler& reco, const LimaString& 
stringBegin=LimaString()); + bool hasMultiTermWord() const { return m_hasMultiTermWord; } + bool hasNoCategoryNorTstatus() const { return m_hasNoCategoryNorTstatus; } + void resetCategoryOrTstatusFlag() { m_hasNoCategoryNorTstatus = false; } + void setHasMultiTermWordFlag() { m_hasMultiTermWord = true; } + private: LimaString m_alias; + bool m_hasMultiTermWord; + bool m_hasNoCategoryNorTstatus; AutomatonString m_automatonString; }; @@ -81,7 +88,6 @@ class Gazeteer : public std::vector // inline access functions /***********************************************************************/ inline uint64_t Gazeteer::numberOfWords() const { return size(); } -inline void Gazeteer::addWord(const LimaString& s) { push_back(s); } inline const LimaString& Gazeteer::alias() const { return m_alias; } inline void Gazeteer::setAlias(const LimaString& a) { m_alias = a; } diff --git a/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/recognizerCompiler.cpp b/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/recognizerCompiler.cpp index e98a7cb53..2e237bb31 100644 --- a/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/recognizerCompiler.cpp +++ b/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/recognizerCompiler.cpp @@ -33,6 +33,7 @@ #include "tstring.h" #include "common/LimaCommon.h" #include "common/Data/strwstrtools.h" +#include "common/tools/FileUtils.h" #include "common/XMLConfigurationFiles/xmlConfigurationFileParser.h" #include "common/time/timeUtilsController.h" @@ -66,7 +67,7 @@ m_stream(0), m_nbRule(0) { AUCLOGINIT; - LINFO << "Opening recognizer compiler with file " << filename; + LDEBUG << "Opening recognizer compiler with file " << filename; m_stream=new ifstream(filename.c_str(), std::ifstream::binary); if (! 
m_stream || !m_stream->good()) { LERROR << "Cannot open file [" << filename << "]"; @@ -176,9 +177,9 @@ void RecognizerCompiler::buildRecognizer(Recognizer& reco, next=findSpecialCharacter(s,CHAR_SEP_LIST,begin); LimaString str = s.mid(begin,(next==-1)?next:next-begin); // initialize entities - string filename=Common::MediaticData::MediaticData::single().getConfigPath()+"/"+ - Misc::limastring2utf8stdstring(str); - XMLConfigurationFiles::XMLConfigurationFileParser parser(filename); + + QString filename = Common::Misc::findFileInPaths(Common::MediaticData::MediaticData::single().getConfigPath().c_str(),str); + XMLConfigurationFiles::XMLConfigurationFileParser parser(filename.toUtf8().constData()); MediaticData::MediaticData::changeable().initEntityTypes(parser); begin=next+1; } while (next != -1); @@ -377,9 +378,13 @@ void RecognizerCompiler::buildRecognizer(Recognizer& reco, LERROR << message.str(); } */ - LINFO << "Adding rule no " << m_nbRule << "(" << r->getRuleId() << ")" + LDEBUG << "Adding rule no " << m_nbRule << "(" << r->getRuleId() << ")" << ": trigger=" << *trigger; reco.addRule(trigger,r); +#ifdef DEBUG_LP + LDEBUG << "rule[" << m_nbRule << "]=" << *r; +#endif + m_nbRule++; delete trigger; } @@ -465,7 +470,11 @@ readSubAutomaton(const LimaString& line, } //********************************************************************** -// add a rule with a gazeteer trigger -> multiply the rules +// add a rule with a gazeteer trigger -> +// 1) create a rule and multiply the reference to this rule in the index +// of recognizer (transition with 1 entry,rule) +// 2) create a a gazeteerTransition and create only one entry in the index +// of recognizer (gazeteerTransition,rule) //********************************************************************** void RecognizerCompiler:: addRuleWithGazeteerTrigger(const LimaString& gazeteerName, @@ -479,64 +488,82 @@ addRuleWithGazeteerTrigger(const LimaString& gazeteerName, const bool headTrigger) { AUCLOGINIT; - // 
Lima::TimeUtilsController* ctrl4 = new Lima::TimeUtilsController("addRuleWithGazeteerTrigger", true); - // identify class alias -// int endTrigger(findSpecialCharacter(s,CHAR_SEP_RULE,1)); -// Tword classAlias(s.mid(1,endTrigger-1)); -// s=s.mid(endTrigger+1); // find gazeteer - // Lima::TimeUtilsController* ctrl41 = new Lima::TimeUtilsController("before init Rule inside addRuleWithGazeteerTrigger", true); - std::size_t i; - for (i=0; i0 ) { - // the class has been found - // only one rule and all triggers point to this rule - Rule* r=new Rule; - - //expandGazeteersInRule(ruleString,gazeteers); - //expandSubAutomatonsInRule(ruleString,subAutomatons); - - // check if there are agreement constraints on following lines - // and add them at end of the rule if there are - ruleString=ruleString+peekConstraints(*m_stream); - ruleString=ruleString+defaultAction; - - // add the trigger to deal with agreement constraints - LimaString triggerString=gazeteers[i][0]; - if (! keepTrigger) { - triggerString=CHAR_NOKEEP_OPEN_RE+triggerString+CHAR_NOKEEP_CLOSE_RE; + // gazeteer not found + if ( gazeteerIndex >= gazeteers.size() || gazeteers[gazeteerIndex].size() == 0 ) { + string str=Misc::limastring2utf8stdstring(gazeteerName); + if (gazeteerIndexsetWeight(currentRuleWeight()); - LINFO << "Adding rule no " << m_nbRule << "(" << r->getRuleId() << ")" - << ": multiple trigger (first is "<setWeight(currentRuleWeight()); + LDEBUG << "Adding rule no " << m_nbRule << "(" << r->getRuleId() << ")" + << ": multiple trigger (first is "<& gazeteerAsVectorOfString = gazeteer; + // TransitionUnit* trigger = new GazeteerTransition(gazeteerAsVectorOfString,gazeteerName,keepTrigger); */ + TransitionUnit* trigger = createGazeteerTransition(gazeteerName, + language, currentId, m_activeEntityGroups, + gazeteers,keepTrigger,headTrigger); + if (trigger != 0) + { + //copy the properties of the trigger of the rule + trigger->copyProperties(*(r->getTrigger())); + reco.addRule(trigger,indexRule); + 
//LINFO << nbRule << ": trigger=" << *trigger; + delete trigger; // it has been copied + } + } + else + { + for (std::size_t j(0); j& activeEntityGroups, + const vector& gazeteers, + const bool keep, + const bool head) +{ + int gazeteerIndex; + for (gazeteerIndex=0; gazeteerIndex= gazeteers.size() || gazeteers[gazeteerIndex].size() == 0 ) { + AUCLOGINIT; + string str=Misc::limastring2utf8stdstring(gazeteerName); + if (gazeteerIndex& gazeteerAsVectorOfString = gazeteer; + // TODO bool negative = automatonString.isNegative()??, Est-ce qu'on autorise un trigger avec une ngation? + bool negative(false); + TransitionUnit* t = new GazeteerTransition(gazeteerAsVectorOfString, gazeteerName, keep); + t->setNegative(negative); + t->setHead(head); + t->setId(id); + return t; +} + TransitionUnit* createTransition(const AutomatonString& automatonString, MediaId language, const std::string& id, @@ -83,7 +124,9 @@ TransitionUnit* createTransition(const LimaString str, const std::vector& activeEntityGroups, const bool keep, const bool neg, - const std::vector& constraints) + const std::vector& constraints, + const std::vector& gazeteerAsVectorOfString + ) { #ifdef DEBUG_LP AUCLOGINIT; @@ -99,7 +142,7 @@ TransitionUnit* createTransition(const LimaString str, FsaStringsPool& sp=Common::MediaticData::MediaticData::changeable().stringsPool(language); #ifdef DEBUG_LP - LDEBUG << "creating transition from string [" + LDEBUG << "createTransition: creating transition from string [" << Common::Misc::limastring2utf8stdstring(str) << "] with id" << id; #endif @@ -143,7 +186,7 @@ TransitionUnit* createTransition(const LimaString str, if (s[0] == CHAR_NOKEEP_OPEN_TR) { if (s[s.length()-1] != CHAR_NOKEEP_CLOSE_TR) { AUCLOGINIT; - LERROR << "confused by no_keep format (maybe incomplete) :" + LERROR << "createTransition: confused by no_keep format (maybe incomplete) :" << Common::Misc::limastring2utf8stdstring(str); } else { @@ -239,6 +282,14 @@ TransitionUnit* createTransition(const LimaString 
str, t = createDefaultTStatusTransition(s,LENGTH_TSTATUS_TR); } // ---------------------------------------------------------------------- + // GazeteerTransition: form belongs to gazeteer + /* + else if (s.indexOf(CHAR_BEGIN_NAMEGAZ,0) == 0) { + // name of gazeteer already identified! + t = new GazeteerTransition(gazeteerAsVectorOfString,alias,keep); + } + */ + // ---------------------------------------------------------------------- // * transition else if (s == STRING_ANY_TR) { t = new StarTransition(); @@ -246,14 +297,24 @@ TransitionUnit* createTransition(const LimaString str, // ---------------------------------------------------------------------- // entity transition else if (s.size()>=2 && s[0]==CHAR_BEGIN_ENTITY && s[s.size()-1]==CHAR_END_ENTITY) { - Common::MediaticData::EntityType type= - resolveEntityName(s.mid(1,s.size()-2),activeEntityGroups); + LimaString entityName(s.mid(1,s.size()-2)); + Common::MediaticData::EntityType type=resolveEntityName(entityName,activeEntityGroups); if (type.isNull()) { - AUCLOGINIT; - LERROR << "cannot resolve entity name " - << Common::Misc::limastring2utf8stdstring(s); + Common::MediaticData::EntityGroupId groupId = resolveGroupName(entityName,activeEntityGroups); + if( groupId == 0) { + AUCLOGINIT; + LERROR << "createTransition: cannot resolve entity name " + << Common::Misc::limastring2utf8stdstring(s); + } + else { + AUCLOGINIT; + LDEBUG << "createTransition: create EntityGroupTransition(" << groupId << ")"; + t=new EntityGroupTransition(groupId); + } } else { + AUCLOGINIT; + LDEBUG << "createTransition: create EntityTransition(" << type << ")"; t=new EntityTransition(type); } } @@ -283,6 +344,46 @@ TransitionUnit* createTransition(const LimaString str, return t; } +//********************************************************************** +// +Common::MediaticData::EntityGroupId +resolveGroupName(const LimaString s, + const std::vector& activeEntityGroups) +{ +#ifdef DEBUG_LP + AUCLOGINIT; + LDEBUG << 
"resolveGroupName: try to resolve group name " + << Common::Misc::limastring2utf8stdstring(s); +#endif + Common::MediaticData::EntityGroupId foundGroup; + try { + LimaString groupName=s; +#ifdef DEBUG_LP + LDEBUG << "resolveGroupName: try group name " << Common::Misc::limastring2utf8stdstring(s); +#endif + foundGroup = Common::MediaticData::MediaticData::single().getEntityGroupId(groupName); + // group is among active groups +#ifdef DEBUG_LP + LDEBUG << "resolveGroupName: foundGroup" << foundGroup; +#endif + for (vector::const_iterator it=activeEntityGroups.begin(), + it_end=activeEntityGroups.end(); it!=it_end; it++) { + if( groupName == *it ) { + return foundGroup; + } + AUCLOGINIT; + LERROR << "resolveGroupName: group " << Common::Misc::limastring2utf8stdstring(s) << " not active"; + return foundGroup; + } + } + catch (LimaException& e) { + AUCLOGINIT; + LERROR << "resolveGroupName: cannot resolve group for " + << Common::Misc::limastring2utf8stdstring(s); + } + return foundGroup; +} + //********************************************************************** // Common::MediaticData::EntityType @@ -291,21 +392,21 @@ resolveEntityName(const LimaString s, { #ifdef DEBUG_LP AUCLOGINIT; - LDEBUG << "TransitionCompiler: try to resolve entity name " + LDEBUG << "resolveEntityName: try to resolve entity name " << Common::Misc::limastring2utf8stdstring(s); #endif // test if word is a known entity name => in this case, entity transition if (s.indexOf(Common::MediaticData::MediaticData::single().getEntityTypeNameSeparator())!=-1) { #ifdef DEBUG_LP - LDEBUG << "TransitionCompiler: entity name is complete"; + LDEBUG << "resolveEntityName: entity name is complete"; #endif try { return Common::MediaticData::MediaticData::single().getEntityType(s); } catch (LimaException& e) { AUCLOGINIT; - LERROR << "unknown entity " << s; + LERROR << "resolveEntityName: unknown entity " << s; } } else { // try to find this entity in active groups @@ -315,14 +416,14 @@ resolveEntityName(const 
LimaString s, try { LimaString entityName=(*it)+Common::MediaticData::MediaticData::single().getEntityTypeNameSeparator()+s; #ifdef DEBUG_LP - LDEBUG << "TransitionCompiler: try entity name " << Common::Misc::limastring2utf8stdstring(entityName); + LDEBUG << "resolveEntityName: try entity name " << Common::Misc::limastring2utf8stdstring(entityName); #endif Common::MediaticData::EntityType findType= Common::MediaticData::MediaticData::single().getEntityType(entityName); if (!type.isNull()) { // there is ambiguity AUCLOGINIT; - LERROR << "cannot resolve entity group for entity " + LERROR << "resolveEntityName: cannot resolve entity group for entity " << Common::Misc::limastring2utf8stdstring(s) << " (at least two groups contain this entity)"; } @@ -333,14 +434,14 @@ resolveEntityName(const LimaString s, catch (LimaException& e) { // not in this group: do nothing (continue search) #ifdef DEBUG_LP - LDEBUG << "entity " << Common::Misc::limastring2utf8stdstring(s) + LDEBUG << "resolveEntityName: entity " << Common::Misc::limastring2utf8stdstring(s) << " not in group " << Common::Misc::limastring2utf8stdstring(*it); #endif } } - if (type.isNull()) { + if (type.isNull()) { // try to interpret s as group AUCLOGINIT; - LERROR << "cannot resolve entity group for entity " + LERROR << "resolveEntityName: cannot resolve entity group for entity " << Common::Misc::limastring2utf8stdstring(s) << " (no active group contains this entity)"; } @@ -411,7 +512,7 @@ Tpos createTpos(const std::string& s, MediaId language) { //search for separator '_' int sep(findSpecialCharacter(Common::Misc::utf8stdstring2limastring(s),CHAR_SEP_MACROMICRO_STRING,0)); if (sep != -1 && string(s,0,sep) == "L") { - // '_' found after L (L_NC) + // '_' found after L (NC) sep=findSpecialCharacter(Common::Misc::utf8stdstring2limastring(s),CHAR_SEP_MACROMICRO_STRING,sep+1); } if (sep == -1) { // only macro diff --git a/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/transitionCompiler.h 
b/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/transitionCompiler.h index 96db81ddf..72ddfbffa 100644 --- a/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/transitionCompiler.h +++ b/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/transitionCompiler.h @@ -47,6 +47,22 @@ Lima::LinguisticProcessing::Automaton::TransitionUnit* MediaId language, const std::string& id, const std::vector& activeEntityGroups); +/** + * Lima::LinguisticProcessing::Automaton::TransitionUnit* + createGazeteerTransition(const AutomatonString& automatonString, + MediaId language, const std::string& id, + const std::vector& activeEntityGroups, + const std::vector& gazeteerAsVectorOfString, + const bool keepTrigger); +*/ +Lima::LinguisticProcessing::Automaton::TransitionUnit* + createGazeteerTransition(const LimaString& gazeteerName, + MediaId language, const std::string& id, + const std::vector& activeEntityGroups, + const std::vector& gazeteers, + const bool keep=true, + const bool head=false); + Lima::LinguisticProcessing::Automaton::TransitionUnit* createTransition(const LimaString, MediaId language, const std::string& id, @@ -54,11 +70,15 @@ Lima::LinguisticProcessing::Automaton::TransitionUnit* const bool keep=true, const bool neg=false, const std::vector& constraints= - std::vector(0)); + std::vector(0), + const std::vector& gazeteerAsVectorOfString = std::vector(0) ); Common::MediaticData::EntityType resolveEntityName(const LimaString str, const std::vector& activeEntityGroups); +Common::MediaticData::EntityGroupId + resolveGroupName(const LimaString s, + const std::vector& activeEntityGroups); } // end namespace } // end namespace diff --git a/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/tstring.cpp b/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/tstring.cpp index 05088bdd9..ad96c73cc 100644 --- 
a/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/tstring.cpp +++ b/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/tstring.cpp @@ -44,8 +44,7 @@ namespace Automaton { //*************************************************************************** void getlineLimaString(std::istream& in, LimaString& s) { // first get a string and convert it to wstring - std::string tmp; - getline(in,tmp); + std::string tmp = Lima::Common::Misc::readLine(in); s=Common::Misc::utf8stdstring2limastring(tmp); } diff --git a/lima_linguisticprocessing/tools/common/catBowFiles.cpp b/lima_linguisticprocessing/tools/common/catBowFiles.cpp index 7830fb450..6c86907bf 100644 --- a/lima_linguisticprocessing/tools/common/catBowFiles.cpp +++ b/lima_linguisticprocessing/tools/common/catBowFiles.cpp @@ -102,7 +102,7 @@ void readAndWriteBoWDocuments(ifstream& fileIn, BinaryWriterBoWDocumentHandler writer(fileOut); while (! fileIn.eof()) { document->reinit(); - reader.readBoWDocumentBlock(fileIn, *document, writer, false); + reader.readBoWDocumentBlock(fileIn, *document, writer, false, false); } } diff --git a/lima_linguisticprocessing/tools/common/getLexiconFromBoW.cpp b/lima_linguisticprocessing/tools/common/getLexiconFromBoW.cpp index d6f6d36c8..35064419e 100644 --- a/lima_linguisticprocessing/tools/common/getLexiconFromBoW.cpp +++ b/lima_linguisticprocessing/tools/common/getLexiconFromBoW.cpp @@ -211,9 +211,9 @@ class GetLexiconBoWDocumentHandler : public AbstractBoWDocumentHandler const std::string& /*elementName*/) {} void processSBoWText(const BoWText* boWText, - bool useIterators); + bool useIterators, bool /*useIndexIterator*/); void processProperties(const Misc::GenericDocumentProperties* /*properties*/, - bool /*useIterators*/) + bool /*useIterators*/, bool /*useIndexIterator*/) {} void closeSBoWNode() {} @@ -227,7 +227,7 @@ class GetLexiconBoWDocumentHandler : public AbstractBoWDocumentHandler }; void 
GetLexiconBoWDocumentHandler::processSBoWText(const BoWText* text, - bool useIterators) + bool useIterators, bool /*useIndexIterator*/) { LIMA_UNUSED(useIterators); BoWTokenIterator it(*text); @@ -306,7 +306,7 @@ void readDocuments(ifstream& fileIn, BoWDocument* document, referenceProperties,filterCategory); while (! fileIn.eof()) { - reader.readBoWDocumentBlock(fileIn,*document,handler,true); + reader.readBoWDocumentBlock(fileIn,*document,handler,true,false); } } @@ -358,8 +358,8 @@ int run(int argc,char** argv) } - string resourcesPath=getenv("LIMA_RESOURCES")==0?"/usr/share/apps/lima/resources":string(getenv("LIMA_RESOURCES")); - string configDir=getenv("LIMA_CONF")==0?"/usr/share/config/lima":string(getenv("LIMA_CONF")); + string resourcesPath=qgetenv("LIMA_RESOURCES").isEmpty()?"/usr/share/apps/lima/resources":string(qgetenv("LIMA_RESOURCES").constData()); + string configDir=qgetenv("LIMA_CONF").isEmpty()?"/usr/share/config/lima":string(qgetenv("LIMA_CONF").constData()); if ( (!param.language.size()) && (!param.codeFile.size()) ) { cerr << "no codefile nor language specified !" 
<< endl; diff --git a/lima_linguisticprocessing/tools/common/parseXMLFile.cpp b/lima_linguisticprocessing/tools/common/parseXMLFile.cpp index d6abcf7f7..8b4db3d03 100644 --- a/lima_linguisticprocessing/tools/common/parseXMLFile.cpp +++ b/lima_linguisticprocessing/tools/common/parseXMLFile.cpp @@ -128,9 +128,9 @@ int run(int argc,char** argv) exit(0); } - string resourcesPath=getenv("LIMA_RESOURCES")==0?"/usr/share/apps/lima/resources":string(getenv("LIMA_RESOURCES")); + string resourcesPath=qgetenv("LIMA_RESOURCES").isEmpty()?"/usr/share/apps/lima/resources":string(qgetenv("LIMA_RESOURCES").constData()); string commonConfigFile=string("lima-common.xml"); - string configDir=getenv("LIMA_CONF")==0?"/usr/share/config/lima":string(getenv("LIMA_CONF")); + string configDir=qgetenv("LIMA_CONF").isEmpty()?"/usr/share/config/lima":string(qgetenv("LIMA_CONF").constData()); XMLConfigurationFileParser parser(param.inputFile); diff --git a/lima_linguisticprocessing/tools/common/parseXMLPropertyFile.cpp b/lima_linguisticprocessing/tools/common/parseXMLPropertyFile.cpp index df70b8a4b..967c70c20 100644 --- a/lima_linguisticprocessing/tools/common/parseXMLPropertyFile.cpp +++ b/lima_linguisticprocessing/tools/common/parseXMLPropertyFile.cpp @@ -23,6 +23,7 @@ #include "common/LimaCommon.h" +#include "common/Data/strwstrtools.h" #include "linguisticProcessing/common/PropertyCode/PropertyCodeManager.h" #include @@ -218,10 +219,10 @@ int run(int argc,char** argv) string line; while (fin.good() && !fin.eof()) { - getline(fin,line); - if (line.size()>0) { - LinguisticCode prop(atoi(line.c_str())); - decode(propcodemanager,prop); + line = Lima::Common::Misc::readLine(fin); + if (line.size()>0) { + LinguisticCode prop(atoi(line.c_str())); + decode(propcodemanager,prop); } } } diff --git a/lima_linguisticprocessing/tools/common/readBoWFile.cpp b/lima_linguisticprocessing/tools/common/readBoWFile.cpp index c65d86214..85047fc3d 100644 --- 
a/lima_linguisticprocessing/tools/common/readBoWFile.cpp +++ b/lima_linguisticprocessing/tools/common/readBoWFile.cpp @@ -262,7 +262,7 @@ void readSDocuments(ifstream& fileIn, BoWDocument* document, BoWBinaryReader& re TextWriterBoWDocumentHandler writer(cout); while (! fileIn.eof()) { - reader.readBoWDocumentBlock(fileIn, *document, writer, param.useIterator); + reader.readBoWDocumentBlock(fileIn, *document, writer, param.useIterator, param.useIndexIterator); } break; } @@ -272,7 +272,7 @@ void readSDocuments(ifstream& fileIn, BoWDocument* document, BoWBinaryReader& re writer.writeBoWDocumentsHeader(); while (! fileIn.eof()) { - reader.readBoWDocumentBlock(fileIn, *document, writer, param.useIterator); + reader.readBoWDocumentBlock(fileIn, *document, writer, param.useIterator, param.useIndexIterator); } writer.writeBoWDocumentsFooter(); } @@ -281,7 +281,7 @@ void readSDocuments(ifstream& fileIn, BoWDocument* document, BoWBinaryReader& re SBoWStatWriter writer; while (! fileIn.eof()) { - reader.readBoWDocumentBlock(fileIn, *document, writer, param.useIterator); + reader.readBoWDocumentBlock(fileIn, *document, writer, param.useIterator, param.useIndexIterator); } cout << writer << endl; break; diff --git a/lima_linguisticprocessing/tools/common/readLinguisticData.cpp b/lima_linguisticprocessing/tools/common/readLinguisticData.cpp index 694200708..67ffee467 100644 --- a/lima_linguisticprocessing/tools/common/readLinguisticData.cpp +++ b/lima_linguisticprocessing/tools/common/readLinguisticData.cpp @@ -68,9 +68,9 @@ int run(int argc,char** argv) // Necessary to initialize factories Lima::AmosePluginsManager::single(); - string resourcesPath=getenv("LIMA_RESOURCES")==0?"/usr/share/apps/lima/resources":string(getenv("LIMA_RESOURCES")); + string resourcesPath=qgetenv("LIMA_RESOURCES").isEmpty()?"/usr/share/apps/lima/resources":string(qgetenv("LIMA_RESOURCES").constData()); string configFile=string("lima-common.xml"); - string 
configDir=getenv("LIMA_CONF")==0?"/usr/share/config/lima":string(getenv("LIMA_CONF")); + string configDir=qgetenv("LIMA_CONF").isEmpty()?"/usr/share/config/lima":string(qgetenv("LIMA_CONF").constData()); std::deque langs; diff --git a/lima_linguisticprocessing/tools/common/testAccessMethod.cpp b/lima_linguisticprocessing/tools/common/testAccessMethod.cpp index ff528f886..acb11dada 100644 --- a/lima_linguisticprocessing/tools/common/testAccessMethod.cpp +++ b/lima_linguisticprocessing/tools/common/testAccessMethod.cpp @@ -183,9 +183,9 @@ int run(int argc,char** argv) void testAccessMethod(const Param& param ) { - string resourcesPath=getenv("LIMA_RESOURCES")==0?"/usr/share/apps/lima/resources":string(getenv("LIMA_RESOURCES")); + string resourcesPath=qgetenv("LIMA_RESOURCES").isEmpty()?"/usr/share/apps/lima/resources":string(qgetenv("LIMA_RESOURCES").constData()); string commonConfigFile=string("lima-common.xml"); - string configDir=getenv("LIMA_CONF")==0?"/usr/share/config/lima":string(getenv("LIMA_CONF")); + string configDir=qgetenv("LIMA_CONF").isEmpty()?"/usr/share/config/lima":string(qgetenv("LIMA_CONF").constData()); AbstractAccessByString* accessMethod(0); if (param.accessMethod == "fsa") diff --git a/lima_linguisticprocessing/tools/common/testContentDict16.cpp b/lima_linguisticprocessing/tools/common/testContentDict16.cpp index 27510fc5d..7ac81b185 100644 --- a/lima_linguisticprocessing/tools/common/testContentDict16.cpp +++ b/lima_linguisticprocessing/tools/common/testContentDict16.cpp @@ -49,6 +49,20 @@ using namespace std; #include +#define ANTINNO_SPECIFIC_LOG + +#ifdef ANTINNO_SPECIFIC_LOG +// FWI 12/05/2015 utilisation de composants d's3 +#include "antinno.s3.config.h" +#include "antinno.s3.fs.File.class.h" +#include "antinno.s3.fs.Directory.class.h" +#include "antinno.s3.fs.FileName.class.h" +#include "antinno.s3.log.Log4cpp.class.h" +#if defined WIN32 +#include "windows.h" +#endif +#endif + //#include "common/linguisticData/linguisticData.h" using 
namespace Lima; @@ -99,10 +113,36 @@ int main(int argc, char **argv) int run(int argc,char** argv) { +#ifndef ANTINNO_SPECIFIC_LOG QsLogging::initQsLog(); // Necessary to initialize factories Lima::AmosePluginsManager::single(); +#else + LoadLibrary("antinno.s3lib.dll"); + static ::antinno::s3::log::Log4cpp log1; + { + + using namespace ::antinno; + QString const c = ::std::getenv("AMOSE_CONF"); + if (c.isEmpty()) + { + std::cerr << "No environment variable \"AMOSE_CONF\" set or variable is empty" << std::endl; + return EXIT_FAILURE; + } + + QString log4cppFilePath = c + "/" + "AntTextIndexer.log4cpp"; + s3::fs::File const log4cppFile((s3::fs::Path(::boost::locale::conv::utf_to_utf(log4cppFilePath.toUtf8().constData()).c_str()))); + //::std::wcout << log4cppFile << ::std::endl; + log1.configure(log4cppFile); + ::antinno::s3::global.log(log1); + if (!QsLogging::Categories::instance().configure(log4cppFilePath.toAscii().constData())) + { + std::cerr << "Configure Problem " << log4cppFilePath.toAscii().constData() << std::endl; + return EXIT_FAILURE; + } + } +#endif cerr << "testContentDict16 begin..." 
<< endl; setlocale(LC_ALL, ""); @@ -257,9 +297,9 @@ const Lima::LimaString& word) const{ void testAnalysisDico(const Param& param ) { - string resourcesPath=getenv("LIMA_RESOURCES")==0?"/usr/share/apps/lima/resources":string(getenv("LIMA_RESOURCES")); + string resourcesPath=qgetenv("LIMA_RESOURCES").isEmpty()?"/usr/share/apps/lima/resources":string(qgetenv("LIMA_RESOURCES").constData()); string commonConfigFile=string("lima-common.xml"); - string configDir=getenv("LIMA_CONF")==0?"/usr/share/config/lima":string(getenv("LIMA_CONF")); + string configDir=qgetenv("LIMA_CONF").isEmpty()?"/usr/share/config/lima":string(qgetenv("LIMA_CONF").constData()); MyAnalysisDico* dico = new MyAnalysisDico(analysisDataElement(0)); dico->parseAccessMethod(param.keyFileName); diff --git a/lima_linguisticprocessing/tools/common/testReadLexicon.cpp b/lima_linguisticprocessing/tools/common/testReadLexicon.cpp index d0734d0c7..d4ed34e61 100644 --- a/lima_linguisticprocessing/tools/common/testReadLexicon.cpp +++ b/lima_linguisticprocessing/tools/common/testReadLexicon.cpp @@ -47,9 +47,9 @@ Param; void testAccessMethod(const Param& param ) { - string resourcesPath=getenv("LIMA_RESOURCES")==0?"/usr/share/apps/lima/resources":string(getenv("LIMA_RESOURCES")); - string commonConfigFile=getenv("LIMA_CONF")==0?"/usr/share/config/lima":string("lima-common.xml"); - string configDir=string(getenv("LIMA_CONF")); + string resourcesPath=qgetenv("LIMA_RESOURCES").isEmpty()?"/usr/share/apps/lima/resources":string(qgetenv("LIMA_RESOURCES").constData()); + string commonConfigFile=string("lima-common.xml"); + string configDir=qgetenv("LIMA_CONF").isEmpty()?"/usr/share/config/lima":string(qgetenv("LIMA_CONF").constData()); // Load lexicon Lima::Common::FsaAccess::FsaAccessSpare16* fsaAccess=new Lima::Common::FsaAccess::FsaAccessSpare16(); diff --git a/lima_linguisticprocessing/tools/dictionary/compileDictionary.cpp b/lima_linguisticprocessing/tools/dictionary/compileDictionary.cpp index d58afe651..ea74e4e17 
100644 --- a/lima_linguisticprocessing/tools/dictionary/compileDictionary.cpp +++ b/lima_linguisticprocessing/tools/dictionary/compileDictionary.cpp @@ -1,318 +1,360 @@ -/* - Copyright 2002-2013 CEA LIST - - This file is part of LIMA. - - LIMA is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - LIMA is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with LIMA. If not, see -*/ - -#include -#include -#include -#include -#include - -#include "common/LimaCommon.h" -#include "common/Data/strwstrtools.h" -#include "common/Data/LimaString.h" -#include "linguisticProcessing/common/PropertyCode/PropertyCodeManager.h" -#include "common/FsaAccess/FsaAccessSpare16.h" -#include "common/misc/AbstractAccessByString.h" -#include "linguisticProcessing/core/FlatTokenizer/CharChart.h" -// #include "linguisticProcessing/core/Tokenizer/ParseChar.h" -// #include "linguisticProcessing/core/Tokenizer/ParseCharClass.h" - -#include "KeysLogger.h" -#include "DictionaryHandler.h" - -#include -#include - -using namespace std; -using namespace Lima; -using namespace Lima::Common; -using namespace Lima::Common::PropertyCode; -using namespace Lima::Common::FsaAccess; -using namespace Lima::Common::Misc; -using namespace Lima::LinguisticProcessing; -using namespace Lima::LinguisticProcessing::FlatTokenizer; - -void usage() -{ - std::cerr << "USAGE : compileDictionary [OPTIONS] file" << std::endl; - std::cerr << "where [OPTIONS] are : " << std::endl; - std::cerr << " --extractKeyList= : only extract keys list to file, no compilation" << endl; - 
std::cerr << " --charChart= : specify charchart file" << endl; - std::cerr << " --fsaKey= : provide fsa access keys to compile" << endl; - std::cerr << " --propertyFile= : specify property coding system (xml file)" << endl; - std::cerr << " --symbolicCodes= : specify symbolic codes file (xml)" << endl; - std::cerr << " --output= : specify output file" << endl; - std::cerr << " --reverse-keys : reverse entries keys" << endl; -} - -// options -typedef struct ParamStruct -{ - std::string extractKeys; - std::string charChart; - std::string fsaKey; - std::string propertyFile; - std::string symbolicCodes; - std::string output; - std::string input; - bool reverseKeys; -} -Param; - - -#include "common/tools/LimaMainTaskRunner.h" -#include "common/AbstractFactoryPattern/AmosePluginsManager.h" -#include - -int run(int aargc,char** aargv); - -int main(int argc, char **argv) -{ - QCoreApplication a(argc, argv); - - // Task parented to the application so that it - // will be deleted by the application. - LimaMainTaskRunner* task = new LimaMainTaskRunner(argc, argv, run, &a); - - // This will cause the application to exit when - // the task signals finished. - QObject::connect(task, SIGNAL(finished(int)), &a, SLOT(quit())); - - // This will run the task from the application event loop. 
- QTimer::singleShot(0, task, SLOT(run())); - - return a.exec(); - -} - - -int run(int argc,char** argv) -{ - QsLogging::initQsLog(); - // Necessary to initialize factories - Lima::AmosePluginsManager::single(); - - setlocale(LC_ALL,"fr_FR.UTF-8"); - - Param param = { - std::string(""), - std::string(""), - std::string(""), - std::string(""), - std::string(""), - std::string(""), - std::string(""), - false}; - - - for (int i = 1 ; i < argc; i++) - { - std::string arg(argv[i]); - int pos = -1; - if (arg == "--help") - { - usage(); - return 0; - } - if ( (pos = arg.find("--extractKeyList=")) != -1 ) - { - param.extractKeys = arg.substr(pos+17); - } - else if ( (pos = arg.find("--fsaKey=")) != -1 ) - { - param.fsaKey = arg.substr(pos+9); - } - else if ( (pos = arg.find("--charChart=")) != -1 ) - { - param.charChart = arg.substr(pos+12); - } - else if ( (pos = arg.find("--propertyFile=")) != -1 ) - { - param.propertyFile = arg.substr(pos+15); - } - else if ( (pos = arg.find("--symbolicCodes=")) != -1 ) - { - param.symbolicCodes = arg.substr(pos+16); - } - else if ( (pos = arg.find("--output=")) != -1 ) - { - param.output = arg.substr(pos+9); - } - else if ( (pos = arg.find("--reverse-keys")) != -1 ) - { - param.reverseKeys = true; - } - else - { - param.input = arg; - } - } - - // check that input file exists - { - ifstream fin(param.input.c_str(), std::ifstream::binary); - if (!fin.good()) - { - cerr << "can't open input file " << param.input << endl; - exit(-1); - } - fin.close(); - } - - // parse charchart - if (param.charChart == "") { - cerr << "please specify CharChart file with --charChart= option" << endl; - exit(0); - } - CharChart* charChart = new CharChart(); - charChart->loadFromFile(param.charChart); - - try - { - cerr << "parse charChart file : " << param.charChart << endl; -// cerr << "TODO: to implement at "<<__FILE__<<", line "<<__LINE__<<"!" 
<setValidationScheme(SAXParser::Val_Auto); - // parser->setDoNamespaces(false); - // parser->setDoSchema(false); - // parser->setValidationSchemaFullChecking(false); - parser.setContentHandler(&keysLogger); - parser.setErrorHandler(&keysLogger); - QFile file(param.input.c_str()); - if (!file.open(QIODevice::ReadOnly)) - { - std::cerr << "Error opening " << param.input << std::endl; - return 1; - } - if (!parser.parse( QXmlInputSource(&file))) - { - std::cerr << "Error parsing " << param.input << " : " << parser.errorHandler()->errorString().toUtf8().constData() << std::endl; - return 1; - } - else - { - std::cerr << std::endl; - } - } - catch (const XMLException& toCatch) - { - std::cerr << "An error occurred Error: " << toCatch.getMessage() << endl; - throw; - } - fout.close(); - } else { - // compile dictionaries - - cerr << "parse property code file : " << param.propertyFile << endl; - PropertyCodeManager propcodemanager; - propcodemanager.readFromXmlFile(param.propertyFile); - - cerr << "parse symbolicCode file : " << param.symbolicCodes << endl; - map conversionMap; - propcodemanager.convertSymbolicCodes(param.symbolicCodes,conversionMap); - cerr << conversionMap.size() << " code read from symbolicCode file" << endl; -/* for (map::const_iterator it=conversionMap.begin(); - it!=conversionMap.end(); - it++) - { - cerr << it->first << " -> " << it->second << endl; - }*/ - - AbstractAccessByString* access(0); - if (param.fsaKey!="") { - cerr << "load fsa access method : " << param.fsaKey << endl; - FsaAccessSpare16* fsaAccess=new FsaAccessSpare16(); - fsaAccess->read(param.fsaKey); - access=fsaAccess; - } else { - cerr << "ERROR : no access Keys defined !" 
<< endl; - exit(-1); - } - cerr << access->getSize() << " keys loaded" << endl; - - cerr << "parse input file : " << param.input << endl; - DictionaryCompiler handler(charChart,access,conversionMap,param.reverseKeys); - - QXmlSimpleReader parser; -// parser->setValidationScheme(SAXParser::Val_Auto); -// parser->setDoNamespaces(false); -// parser->setDoSchema(false); -// parser->setValidationSchemaFullChecking(false); - try - { - parser.setContentHandler(&handler); - parser.setErrorHandler(&handler); - QFile file(param.input.c_str()); - if (!file.open(QIODevice::ReadOnly)) - { - std::cerr << "Error opening " << param.input << std::endl; - return 1; - } - if (!parser.parse( QXmlInputSource(&file))) - { - std::cerr << "Error parsing " << param.input << " : " << parser.errorHandler()->errorString().toUtf8().constData() << std::endl; - return 1; - } - } - catch (const XMLException& toCatch) - { - cerr << "An error occurred Error: " << toCatch.getMessage() << endl; - throw; - } - - cerr << "write data to output file : " << param.output << endl; - ofstream fout(param.output.c_str(),ios::out | ios::binary); - if (!fout.good()) - { - cerr << "can't open file " << param.output << endl; - exit(-1); - } - handler.writeBinaryDictionary(fout); - fout.close(); - delete access; - } - return EXIT_SUCCESS; -} +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. 
If not, see +*/ + +#include +#include +#include +#include +#include + +#include "common/LimaCommon.h" +#include "common/Data/strwstrtools.h" +#include "common/Data/LimaString.h" +#include "linguisticProcessing/common/PropertyCode/PropertyCodeManager.h" +#include "common/FsaAccess/FsaAccessSpare16.h" +#include "common/misc/AbstractAccessByString.h" +#include "linguisticProcessing/core/FlatTokenizer/CharChart.h" +// #include "linguisticProcessing/core/Tokenizer/ParseChar.h" +// #include "linguisticProcessing/core/Tokenizer/ParseCharClass.h" + +#include "KeysLogger.h" +#include "DictionaryHandler.h" + +#include +#include + +#ifdef ANTINNO_SPECIFIC +#include "common/AbstractFactoryPattern/antinno.LibraryLoader.class.h" +#endif + +using namespace std; +using namespace Lima; +using namespace Lima::Common; +using namespace Lima::Common::PropertyCode; +using namespace Lima::Common::FsaAccess; +using namespace Lima::Common::Misc; +using namespace Lima::LinguisticProcessing; +using namespace Lima::LinguisticProcessing::FlatTokenizer; + +void usage() +{ + std::cerr << "USAGE : compileDictionary [OPTIONS] file" << std::endl; + std::cerr << "where [OPTIONS] are : " << std::endl; + std::cerr << " --extractKeyList= : only extract keys list to file, no compilation" << endl; + std::cerr << " --charChart= : specify charchart file" << endl; + std::cerr << " --fsaKey= : provide fsa access keys to compile" << endl; + std::cerr << " --propertyFile= : specify property coding system (xml file)" << endl; + std::cerr << " --symbolicCodes= : specify symbolic codes file (xml)" << endl; + std::cerr << " --output= : specify output file" << endl; + std::cerr << " --reverse-keys : reverse entries keys" << endl; +} + +// options +typedef struct ParamStruct +{ + std::string extractKeys; + std::string charChart; + std::string fsaKey; + std::string propertyFile; + std::string symbolicCodes; + std::string output; + std::string input; + bool reverseKeys; +} +Param; + + +#include 
"common/tools/LimaMainTaskRunner.h" +#include "common/AbstractFactoryPattern/AmosePluginsManager.h" +#include + +int run(int aargc,char** aargv); + +int main(int argc, char **argv) +{ + QCoreApplication a(argc, argv); + + // Task parented to the application so that it + // will be deleted by the application. + LimaMainTaskRunner* task = new LimaMainTaskRunner(argc, argv, run, &a); + + // This will cause the application to exit when + // the task signals finished. + QObject::connect(task, SIGNAL(finished(int)), &a, SLOT(quit())); + + // This will run the task from the application event loop. + QTimer::singleShot(0, task, SLOT(run())); + + return a.exec(); + +} + + +int run(int argc,char** argv) +{ +#ifdef ANTINNO_SPECIFIC + + + { + + + ::std::string const configDir = ::std::getenv("AMOSE_CONF"); + if (configDir.empty()) + { + std::cerr << "No environment variable \"AMOSE_CONF\" set or variable is empty" << std::endl; + return EXIT_FAILURE; + } + + try + { + ::std::string const file = configDir + "/plugins.txt"; + Lima::antinno::LibraryLoader().loadFromFile(file); + } + catch (::std::exception const& ex) + { + std::cerr << "Exception during plugins loading. 
" << ex.what() << std::endl; + return EXIT_FAILURE; + } + + ::std::string const log4cppFilePath = configDir + "/log4cpp.properties"; + ::boost::shared_ptr pLog1(new QsLogging::antinno::Log4cpp()); + pLog1->configure(log4cppFilePath); + QsLogging::antinno::log = pLog1; + if (!QsLogging::Categories::instance().configure(log4cppFilePath.data())) + { + std::cerr << "Configure Problem " << log4cppFilePath << std::endl; + return EXIT_FAILURE; + } + + ::std::cout << "Plugins initialized" << ::std::endl; + } +#else + QsLogging::initQsLog(); + // Necessary to initialize factories + Lima::AmosePluginsManager::single(); +#endif + + setlocale(LC_ALL,"fr_FR.UTF-8"); + + Param param = { + std::string(""), + std::string(""), + std::string(""), + std::string(""), + std::string(""), + std::string(""), + std::string(""), + false}; + + + for (int i = 1 ; i < argc; i++) + { + std::string arg(argv[i]); + int pos = -1; + if (arg == "--help") + { + usage(); + return 0; + } + if ( (pos = arg.find("--extractKeyList=")) != -1 ) + { + param.extractKeys = arg.substr(pos+17); + } + else if ( (pos = arg.find("--fsaKey=")) != -1 ) + { + param.fsaKey = arg.substr(pos+9); + } + else if ( (pos = arg.find("--charChart=")) != -1 ) + { + param.charChart = arg.substr(pos+12); + } + else if ( (pos = arg.find("--propertyFile=")) != -1 ) + { + param.propertyFile = arg.substr(pos+15); + } + else if ( (pos = arg.find("--symbolicCodes=")) != -1 ) + { + param.symbolicCodes = arg.substr(pos+16); + } + else if ( (pos = arg.find("--output=")) != -1 ) + { + param.output = arg.substr(pos+9); + } + else if ( (pos = arg.find("--reverse-keys")) != -1 ) + { + param.reverseKeys = true; + } + else + { + param.input = arg; + } + } + + // check that input file exists + { + ifstream fin(param.input.c_str(), std::ifstream::binary); + if (!fin.good()) + { + cerr << "can't open input file " << param.input << endl; + exit(-1); + } + fin.close(); + } + + // parse charchart + if (param.charChart == "") { + cerr << "please 
specify CharChart file with --charChart= option" << endl; + exit(0); + } + CharChart* charChart = new CharChart(); + charChart->loadFromFile(param.charChart); + + try + { + cerr << "parse charChart file : " << param.charChart << endl; +// cerr << "TODO: to implement at "<<__FILE__<<", line "<<__LINE__<<"!" <setValidationScheme(SAXParser::Val_Auto); + // parser->setDoNamespaces(false); + // parser->setDoSchema(false); + // parser->setValidationSchemaFullChecking(false); + parser.setContentHandler(&keysLogger); + parser.setErrorHandler(&keysLogger); + QFile file(param.input.c_str()); + if (!file.open(QIODevice::ReadOnly)) + { + std::cerr << "Error opening " << param.input << std::endl; + return 1; + } + if (!parser.parse( QXmlInputSource(&file))) + { + std::cerr << "Error parsing " << param.input << " : " << parser.errorHandler()->errorString().toUtf8().constData() << std::endl; + return 1; + } + else + { + std::cerr << std::endl; + } + } + catch (const XMLException& toCatch) + { + std::cerr << "An error occurred Error: " << toCatch.getMessage() << endl; + throw; + } + fout.close(); + } else { + // compile dictionaries + + cerr << "parse property code file : " << param.propertyFile << endl; + PropertyCodeManager propcodemanager; + propcodemanager.readFromXmlFile(param.propertyFile); + + cerr << "parse symbolicCode file : " << param.symbolicCodes << endl; + map conversionMap; + propcodemanager.convertSymbolicCodes(param.symbolicCodes,conversionMap); + cerr << conversionMap.size() << " code read from symbolicCode file" << endl; +/* for (map::const_iterator it=conversionMap.begin(); + it!=conversionMap.end(); + it++) + { + cerr << it->first << " -> " << it->second << endl; + }*/ + + AbstractAccessByString* access(0); + if (param.fsaKey!="") { + cerr << "load fsa access method : " << param.fsaKey << endl; + FsaAccessSpare16* fsaAccess=new FsaAccessSpare16(); + fsaAccess->read(param.fsaKey); + access=fsaAccess; + } else { + cerr << "ERROR : no access Keys defined !" 
<< endl; + exit(-1); + } + cerr << access->getSize() << " keys loaded" << endl; + + cerr << "parse input file : " << param.input << endl; + DictionaryCompiler handler(charChart,access,conversionMap,param.reverseKeys); + + QXmlSimpleReader parser; +// parser->setValidationScheme(SAXParser::Val_Auto); +// parser->setDoNamespaces(false); +// parser->setDoSchema(false); +// parser->setValidationSchemaFullChecking(false); + try + { + parser.setContentHandler(&handler); + parser.setErrorHandler(&handler); + QFile file(param.input.c_str()); + if (!file.open(QIODevice::ReadOnly)) + { + std::cerr << "Error opening " << param.input << std::endl; + return 1; + } + if (!parser.parse( QXmlInputSource(&file))) + { + std::cerr << "Error parsing " << param.input << " : " << parser.errorHandler()->errorString().toUtf8().constData() << std::endl; + return 1; + } + } + catch (const XMLException& toCatch) + { + cerr << "An error occurred Error: " << toCatch.getMessage() << endl; + throw; + } + + cerr << "write data to output file : " << param.output << endl; + ofstream fout(param.output.c_str(),ios::out | ios::binary); + if (!fout.good()) + { + cerr << "can't open file " << param.output << endl; + exit(-1); + } + handler.writeBinaryDictionary(fout); + fout.close(); + delete access; + } + return EXIT_SUCCESS; +} diff --git a/lima_linguisticprocessing/tools/dictionary/testComposedDict.cpp b/lima_linguisticprocessing/tools/dictionary/testComposedDict.cpp index e0c77c8ea..4309cc3ab 100644 --- a/lima_linguisticprocessing/tools/dictionary/testComposedDict.cpp +++ b/lima_linguisticprocessing/tools/dictionary/testComposedDict.cpp @@ -34,8 +34,10 @@ #include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" #include "common/MediaticData/mediaticData.h" #include "common/Data/LimaString.h" +#include "common/tools/FileUtils.h" #include "common/misc/fsaStringsPool.h" #include "common/FsaAccess/FsaAccessSpare16.h" +#include "common/FsaAccess/FsaAccessSpare16.h" #include 
"linguisticProcessing/core/AnalysisDict/AbstractAnalysisDictionary.h" #include "linguisticProcessing/core/AnalysisDict/EnhancedAnalysisDictionary.h" #include "DictionaryEntryLogger.h" @@ -64,6 +66,7 @@ typedef struct ParamStruct std::string defaultDataFileName; std::string key; std::string keyFile; + std::string limaConfigFile; int offset; bool superword; bool withDebug; @@ -120,6 +123,7 @@ int run(int argc,char** argv) std::string(""), std::string(""), std::string(""), + std::string(""), -1, false, false @@ -138,6 +142,10 @@ int run(int argc,char** argv) { param.language = arg.substr(pos+11); } + else if ( (pos = arg.find("--limaConfigFile=")) != std::string::npos ) + { + param.limaConfigFile = arg.substr(pos+17); + } else if ( (pos = arg.find("--dicoId=")) != std::string::npos ) { param.dicoId = arg.substr(pos+9); @@ -182,8 +190,8 @@ int run(int argc,char** argv) } - std::string resourcesPath=getenv("LIMA_RESOURCES")==0?"/usr/share/apps/lima/resources":string(getenv("LIMA_RESOURCES")); - std::string configDir=getenv("LIMA_CONF")==0?"/usr/share/config/lima":string(getenv("LIMA_CONF")); + std::string resourcesPath=qgetenv("LIMA_RESOURCES").isEmpty()?"/usr/share/apps/lima/resources":string(qgetenv("LIMA_RESOURCES").constData()); + std::string configDir=qgetenv("LIMA_CONF").isEmpty()?"/usr/share/config/lima":string(qgetenv("LIMA_CONF").constData()); std::string commonConfigFile="/lima-common.xml"; deque langs; langs.push_back(param.language); @@ -199,17 +207,21 @@ int run(int argc,char** argv) cout << " --dicoId='" << param.dicoId << "'" << endl; - string configPath=Common::MediaticData::MediaticData::single().getConfigPath(); + QString configPath=QString::fromUtf8(Common::MediaticData::MediaticData::single().getConfigPath().c_str()); cout << "load language " << param.language << endl; MediaId langid=MediaticData::single().getMediaId(param.language); - string file; + QString file; try { - Common::XMLConfigurationFiles::XMLConfigurationFileParser 
configuration(configPath + "/lima-analysis.xml"); - file=configPath + "/" + configuration.getModuleGroupParamValue( + QString configurationFile = Common::Misc::findFileInPaths(configPath, QString::fromUtf8("lima-analysis.xml")); + if (! param.limaConfigFile.empty()) { + configurationFile=QString::fromUtf8(param.limaConfigFile.c_str()); + } + Common::XMLConfigurationFiles::XMLConfigurationFileParser configuration(configurationFile.toUtf8().constData()); + file = Common::Misc::findFileInPaths(configPath, QString::fromUtf8( configuration.getModuleGroupParamValue( "lima-coreclient", "mediaProcessingDefinitionFiles", - param.language); + param.language).c_str() ) ); } catch (NoSuchParam& ) { @@ -217,7 +229,7 @@ int run(int argc,char** argv) throw InvalidConfiguration(); } - XMLConfigurationFileParser langParser(file); + XMLConfigurationFileParser langParser(file.toUtf8().constData()); // initialize resources try diff --git a/lima_linguisticprocessing/tools/normalize/desaccent.cpp b/lima_linguisticprocessing/tools/normalize/desaccent.cpp index 5153bf129..ffcf4a8c7 100644 --- a/lima_linguisticprocessing/tools/normalize/desaccent.cpp +++ b/lima_linguisticprocessing/tools/normalize/desaccent.cpp @@ -187,7 +187,7 @@ int run(int argc,char** argv) string line; if (fin.good()) { - getline(fin,line); + line = Lima::Common::Misc::readLine(fin); while (fin.good() && !fin.eof() && line!="") { LimaString str=utf8stdstring2limastring(line); @@ -201,7 +201,7 @@ int run(int argc,char** argv) res = charChart->unmark(str); } cout << limastring2utf8stdstring(res) << endl; - getline(fin,line); + line = Lima::Common::Misc::readLine(fin); } } } diff --git a/lima_linguisticprocessing/tools/normalize/normalizeTerm.cpp b/lima_linguisticprocessing/tools/normalize/normalizeTerm.cpp index a9271d3f3..1f7e52dc6 100644 --- a/lima_linguisticprocessing/tools/normalize/normalizeTerm.cpp +++ b/lima_linguisticprocessing/tools/normalize/normalizeTerm.cpp @@ -22,6 +22,7 @@ 
***************************************************************************/ #include "common/LimaCommon.h" +#include "common/tools/FileUtils.h" #include "common/tools/LimaMainTaskRunner.h" #include "common/MediaticData/mediaticData.h" #include "common/XMLConfigurationFiles/xmlConfigurationFileParser.h" @@ -89,10 +90,6 @@ int main(int argc, char **argv) int run(int argc,char** argv) { - QsLogging::initQsLog(); - // Necessary to initialize factories - Lima::AmosePluginsManager::single(); - bool docatch = false; if (argc>1) { @@ -125,12 +122,10 @@ int run(int argc,char** argv) return dowork(argc,argv); } - int dowork(int argc,char* argv[]) { - - string resourcesPath=getenv("LIMA_RESOURCES")==0?"/usr/share/apps/lima/resources":string(getenv("LIMA_RESOURCES")); - string configDir=getenv("LIMA_CONF")==0?"/usr/share/config/lima":string(getenv("LIMA_CONF")); + string resourcesPathParam=qgetenv("LIMA_RESOURCES").isEmpty()?"/usr/share/apps/lima/resources":string(qgetenv("LIMA_RESOURCES").constData()); + string configPathParam=qgetenv("LIMA_CONF").isEmpty()?"/usr/share/config/lima":string(qgetenv("LIMA_CONF").constData()); string lpConfigFile=string("lima-analysis.xml"); string commonConfigFile=string("lima-common.xml"); string pipeline=string("normalization"); @@ -161,9 +156,9 @@ int dowork(int argc,char* argv[]) else if ( (pos = arg.find("--common-config-file=")) != std::string::npos ) commonConfigFile = arg.substr(pos+21); else if ( (pos = arg.find("--config-dir=")) != std::string::npos ) - configDir = arg.substr(pos+13); + configPathParam = arg.substr(pos+13); else if ( (pos = arg.find("--resources-dir=")) != std::string::npos ) - resourcesPath = arg.substr(pos+16); + resourcesPathParam = arg.substr(pos+16); else if ( (pos = arg.find("--language=")) != std::string::npos ) langs.push_back(arg.substr(pos+11)); // else if ( (pos = arg.find("--pipeline=")) != std::string::npos ) @@ -185,29 +180,50 @@ int dowork(int argc,char* argv[]) return -1; } - 
AbstractLinguisticProcessingClient* client(0); + QStringList configDirs = buildConfigurationDirectoriesList(QStringList() << "lima",QStringList()); + QString configPath = configDirs.join(LIMA_PATH_SEPARATOR); + if (!configPathParam.empty()) + { + configPath = QString::fromUtf8(configPathParam.c_str()); + configDirs = configPath.split(LIMA_PATH_SEPARATOR); + } + QStringList resourcesDirs = buildResourcesDirectoriesList(QStringList() << "lima",QStringList()); + QString resourcesPath = resourcesDirs.join(LIMA_PATH_SEPARATOR); + if (!resourcesPathParam.empty()) + { + resourcesPath = QString::fromUtf8(resourcesPathParam.c_str()); + resourcesDirs = resourcesPath.split(LIMA_PATH_SEPARATOR); + } + + QsLogging::initQsLog(configPath); + // Necessary to initialize factories + Lima::AmosePluginsManager::single(); + Lima::AmosePluginsManager::changeable().loadPlugins(configPath); try { // initialize common MediaticData::changeable().init( - resourcesPath, - configDir, + resourcesPath.toUtf8().constData(), + configPath.toUtf8().constData(), commonConfigFile, langs); // initialize linguistic processing deque pipelines; pipelines.push_back(pipeline); - Lima::Common::XMLConfigurationFiles::XMLConfigurationFileParser lpconfig(configDir + "/" + lpConfigFile); + + QString lpConfigFileFound = Common::Misc::findFileInPaths(configPath, lpConfigFile.c_str(), LIMA_PATH_SEPARATOR); + + Lima::Common::XMLConfigurationFiles::XMLConfigurationFileParser lpconfig(lpConfigFileFound.toUtf8().constData()); LinguisticProcessingClientFactory::changeable().configureClientFactory( - clientId, - lpconfig, - langs, - pipelines); + clientId, + lpconfig, + langs, + pipelines); - client=dynamic_cast(LinguisticProcessingClientFactory::single().createClient(clientId)); + shared_ptr client= std::dynamic_pointer_cast(LinguisticProcessingClientFactory::single().createClient(clientId)); // Set the handlers std::map handlers; @@ -233,7 +249,7 @@ int dowork(int argc,char* argv[]) char buf[256]; 
file.getline(buf,256); std::string line(buf); - while (!file.eof()) + while (file.good()) { if (line.size()==0) { @@ -248,13 +264,12 @@ int dowork(int argc,char* argv[]) // analyze it metaData["FileName"]=*fileItr; - - // Lima::TimeUtilsController *timer = new Lima::TimeUtilsController("test",true); + + // Lima::TimeUtilsController *timer = new Lima::TimeUtilsController("test",true); client->analyze(contentText,metaData,pipeline,handlers); - // delete timer; - - - + // delete timer; + + // analyze resulting bowText to extract normalization multimap norms=extractNormalization(contentText,bowTextHandler.getBowText(),lang); if (norms.empty()) @@ -282,7 +297,6 @@ int dowork(int argc,char* argv[]) throw e; } - delete client; return SUCCESS_ID; } diff --git a/lima_linguisticprocessing/tools/tva/AnalysisTestCase.cpp b/lima_linguisticprocessing/tools/tva/AnalysisTestCase.cpp index 1c5d5ca25..1dbd35d0a 100644 --- a/lima_linguisticprocessing/tools/tva/AnalysisTestCase.cpp +++ b/lima_linguisticprocessing/tools/tva/AnalysisTestCase.cpp @@ -89,7 +89,7 @@ TestCaseError AnalysisTestCaseProcessor::processTestCase(const Lima::Common::TGV ofstream fout(outputfile.c_str(), std::ofstream::binary); fout << "" << endl; Common::BagOfWords::BoWXMLWriter writer(fout); - writer.writeBoWText(&text, true); + writer.writeBoWText(&text, true, false); fout.close(); TestCaseError error = evalTestCase( testCase, *pipItr, filename, filenameWithPipeLine ); diff --git a/lima_linguisticprocessing/tools/tva/tva.cpp b/lima_linguisticprocessing/tools/tva/tva.cpp index 444edcd5b..d9f30dc3e 100644 --- a/lima_linguisticprocessing/tools/tva/tva.cpp +++ b/lima_linguisticprocessing/tools/tva/tva.cpp @@ -32,6 +32,7 @@ #include "common/MediaticData/mediaticData.h" #include "common/XMLConfigurationFiles/xmlConfigurationFileParser.h" #include "common/Handler/AbstractAnalysisHandler.h" +#include "common/tools/FileUtils.h" #include "linguisticProcessing/client/AbstractLinguisticProcessingClient.h" #include 
"linguisticProcessing/client/LinguisticProcessingClientFactory.h" @@ -45,6 +46,7 @@ #include "common/AbstractFactoryPattern/AmosePluginsManager.h" // #endif +using namespace Lima::Common::Misc; using namespace Lima::Common::TGV; using namespace Lima::AnalysisValidation; using namespace Lima::LinguisticProcessing; @@ -82,12 +84,19 @@ int main(int argc, char **argv) int run(int argc,char** argv) { - QsLogging::initQsLog(); + QStringList configDirs = buildConfigurationDirectoriesList(QStringList() << "lima",QStringList()); + QString configPath = configDirs.join(LIMA_PATH_SEPARATOR); + + QStringList resourcesDirs = buildResourcesDirectoriesList(QStringList() << "lima",QStringList()); + QString resourcesPath = resourcesDirs.join(LIMA_PATH_SEPARATOR); + + QsLogging::initQsLog(configPath); // Necessary to initialize factories Lima::AmosePluginsManager::single(); + Lima::AmosePluginsManager::changeable().loadPlugins(configPath); - std::string resourcesPath=getenv("LIMA_RESOURCES")==0?"/usr/share/apps/lima/resources":std::string(getenv("LIMA_RESOURCES")); - std::string configDir=getenv("LIMA_CONF")==0?"/usr/share/config/lima":std::string(getenv("LIMA_CONF")); + std::string strConfigPath; + std::string strResourcesPath; std::string lpConfigFile=std::string("lima-lp-tva.xml"); std::string commonConfigFile=std::string("lima-common.xml"); std::string clientId=std::string("lima-coreclient"); @@ -112,9 +121,9 @@ int run(int argc,char** argv) else if ( (pos = arg.find("--common-config-file=")) != std::string::npos ) commonConfigFile = arg.substr(pos+21); else if ( (pos = arg.find("--config-dir=")) != std::string::npos ) - configDir = arg.substr(pos+13); + strConfigPath = arg.substr(pos+13); else if ( (pos = arg.find("--resources-dir=")) != std::string::npos ) - resourcesPath = arg.substr(pos+16); + strResourcesPath = arg.substr(pos+16); else if ( (pos = arg.find("--client=")) != std::string::npos ) clientId=arg.substr(pos+9); else if ( (pos = arg.find("--working-dir=")) != 
std::string::npos ) @@ -134,27 +143,49 @@ int run(int argc,char** argv) std::cerr << "No language specified. Aborting." << std::endl; return 1; } + if (!strResourcesPath.empty()) + { + resourcesPath = QString::fromUtf8(strResourcesPath.c_str()); + resourcesDirs = resourcesPath.split(LIMA_PATH_SEPARATOR); + } + if (!strConfigPath.empty()) + { + configPath = QString::fromUtf8(strConfigPath.c_str()); + configDirs = configPath.split(LIMA_PATH_SEPARATOR); + } setlocale(LC_ALL,"fr_FR.UTF-8"); - AbstractLinguisticProcessingClient* client(0); - // initialize common - MediaticData::changeable().init( - resourcesPath, - configDir, + Common::MediaticData::MediaticData::changeable().init( + resourcesPath.toUtf8().constData(), + configPath.toUtf8().constData(), commonConfigFile, langs); - - // initialize linguistic processing - Lima::Common::XMLConfigurationFiles::XMLConfigurationFileParser lpconfig(configDir + "/" + lpConfigFile); - LinguisticProcessingClientFactory::changeable().configureClientFactory( - clientId, - lpconfig, - langs, - pipelines); - - client=static_cast(LinguisticProcessingClientFactory::single().createClient(clientId)); + + bool clientFactoryConfigured = false; + Q_FOREACH(QString configDir, configDirs) + { + if (QFileInfo(configDir + "/" + lpConfigFile.c_str()).exists()) + { + // initialize linguistic processing + Lima::Common::XMLConfigurationFiles::XMLConfigurationFileParser lpconfig((configDir + "/" + lpConfigFile.c_str()).toStdString()); + LinguisticProcessingClientFactory::changeable().configureClientFactory( + clientId, + lpconfig, + langs, + pipelines); + clientFactoryConfigured = true; + break; + } + } + if(!clientFactoryConfigured) + { + std::cerr << "No LinguisticProcessingClientFactory were configured with" << configDirs.join(LIMA_PATH_SEPARATOR).toStdString() << "and" << lpConfigFile << std::endl; + return EXIT_FAILURE; + } + + std::shared_ptr< AbstractLinguisticProcessingClient > client = 
std::dynamic_pointer_cast(LinguisticProcessingClientFactory::single().createClient(clientId)); // Set the handlers std::map handlers; @@ -165,7 +196,7 @@ int run(int argc,char** argv) BowTextHandler* bowTextHandler = new BowTextHandler(); handlers.insert(std::make_pair("bowTextHandler", bowTextHandler)); - AnalysisTestCaseProcessor analysisTestCaseProcessor(workingDir, client, handlers); + AnalysisTestCaseProcessor analysisTestCaseProcessor(workingDir, client.get(), handlers); QXmlSimpleReader parser; TestCasesHandler tch(analysisTestCaseProcessor); @@ -234,7 +265,6 @@ int run(int argc,char** argv) std::cout << std::endl; tch.m_reportByType.clear(); } - delete client; delete bowTextWriter; delete simpleStreamHandler; delete bowTextHandler; diff --git a/lima_linguisticprocessing/tools/tvr/tvr.cpp b/lima_linguisticprocessing/tools/tvr/tvr.cpp index 0de47839b..25e31e2ae 100644 --- a/lima_linguisticprocessing/tools/tvr/tvr.cpp +++ b/lima_linguisticprocessing/tools/tvr/tvr.cpp @@ -145,8 +145,6 @@ int run(int argc,char** argv) setlocale(LC_ALL,"fr_FR.UTF-8"); - AbstractLinguisticProcessingClient* client(0); - // initialize common MediaticData::changeable().init( resourcesPath, @@ -161,11 +159,11 @@ int run(int argc,char** argv) lpconfig, MediaticData::single().getMedias()); - client=dynamic_cast(LinguisticProcessingClientFactory::single().createClient(clientId)); + std::shared_ptr client=std::dynamic_pointer_cast(LinguisticProcessingClientFactory::single().createClient(clientId)); ReaderTestCaseProcessor - readerTestCaseProcessor(workingDir, client); + readerTestCaseProcessor(workingDir, client.get()); QXmlSimpleReader parser; TestCasesHandler tch(readerTestCaseProcessor); From 7ee733980db11d08f8f17ce1856e11a0e685f1ae Mon Sep 17 00:00:00 2001 From: Gael de Chalendar Date: Fri, 2 Jun 2017 12:01:56 +0200 Subject: [PATCH 82/82] Solves issue #59 Correctly handle numeric tokens containing commas by setting the comma character to unmark to itself. 
--- .../eng/tokenizerAutomaton-eng.chars.tok | 4 ++-- .../fre/tokenizerAutomaton-fre.chars.tok | 2 +- .../data/test-eng.default.xml | 18 ++++++++++++++---- .../data/test-fre.default.xml | 16 +++++++++++++--- 4 files changed, 30 insertions(+), 10 deletions(-) diff --git a/lima_linguisticdata/scratch/LinguisticProcessings/eng/tokenizerAutomaton-eng.chars.tok b/lima_linguisticdata/scratch/LinguisticProcessings/eng/tokenizerAutomaton-eng.chars.tok index 24b0f15db..c4e1d4bb1 100644 --- a/lima_linguisticdata/scratch/LinguisticProcessings/eng/tokenizerAutomaton-eng.chars.tok +++ b/lima_linguisticdata/scratch/LinguisticProcessings/eng/tokenizerAutomaton-eng.chars.tok @@ -59,8 +59,8 @@ chars { 0029, RIGHT PARENTHESIS, c_del1 ; 002A, ASTERISK, c_del1 ; 002B, PLUS SIGN, c_plus ; -002C, COMMA, c_comma ; -002D, HYPHEN-MINUS, m_pattern ; +002C, COMMA, c_comma, u002C ; +002D, HYPHEN-MINUS, m_pattern, u002D ; 002E, FULL STOP, c_dot ; 002F, SOLIDUS, c_slash ; 0030, DIGIT ZERO, c_5, m0030 ; diff --git a/lima_linguisticdata/scratch/LinguisticProcessings/fre/tokenizerAutomaton-fre.chars.tok b/lima_linguisticdata/scratch/LinguisticProcessings/fre/tokenizerAutomaton-fre.chars.tok index c223c911c..d3751c9f8 100644 --- a/lima_linguisticdata/scratch/LinguisticProcessings/fre/tokenizerAutomaton-fre.chars.tok +++ b/lima_linguisticdata/scratch/LinguisticProcessings/fre/tokenizerAutomaton-fre.chars.tok @@ -59,7 +59,7 @@ chars { 0029, RIGHT PARENTHESIS, c_del1 ; 002A, ASTERISK, c_del1 ; 002B, PLUS SIGN, c_plus ; -002C, COMMA, c_comma ; +002C, COMMA, c_comma, u002C ; 002D, HYPHEN-MINUS, m_pattern ; 002E, FULL STOP, c_dot ; 002F, SOLIDUS, c_slash ; diff --git a/lima_linguisticprocessing/data/test-eng.default.xml b/lima_linguisticprocessing/data/test-eng.default.xml index 3594aeb78..f23c1712b 100644 --- a/lima_linguisticprocessing/data/test-eng.default.xml +++ b/lima_linguisticprocessing/data/test-eng.default.xml @@ -57,14 +57,19 @@ - + - 24.99 reçoit la categorie num card - 24,99 reçoit la 
categorie num card + + 24.99 reçoit la categorie num card - + - 24,99 reçoit la categorie num card + 24,99 gets the DET tag and its lemma is itself + @@ -97,7 +102,7 @@ right="euritrack"/> - EURITRACK est un mot inconnu, doit être normalisé 'euritrack' + E.U.R.I.T.R.A.C.K. est un mot inconnu, doit être normalisé 'euritrack' @@ -121,6 +126,11 @@ 24.99 reçoit la categorie num card +