diff --git a/gbuild.sh b/gbuild.sh index f2c875dae..6a2a72591 100755 --- a/gbuild.sh +++ b/gbuild.sh @@ -93,7 +93,7 @@ source_dir=$PWD if [[ $version = "rev" ]]; then release="$current_timestamp-$current_revision" else -release="2" +release="0" fi if [[ $parallel = "true" ]]; then diff --git a/lima_annoqt/src/annoqt.cpp b/lima_annoqt/src/annoqt.cpp index d537ead74..523cafe7c 100644 --- a/lima_annoqt/src/annoqt.cpp +++ b/lima_annoqt/src/annoqt.cpp @@ -405,7 +405,7 @@ bool Annoqt::saveFile( const QString &fileName ) // std::cerr<document()->toHtml("utf-8").toUtf8().data() << std::endl; - foreach (SpecificEntity* entity, m_entities) + Q_FOREACH (SpecificEntity* entity, m_entities) { QString string = entity->string(); QRegExp rxamp("&(?!amp;)"); @@ -463,7 +463,7 @@ Annoqt::~Annoqt() { qDebug() << "Annoqt::~Annoqt"; - foreach (SpecificEntity* se, m_entities) + Q_FOREACH (SpecificEntity* se, m_entities) { delete se; } @@ -801,7 +801,7 @@ void Annoqt::computeEntitiesMap() { qDebug() << "Annoqt::computeEntitiesMap"; m_entitiesMap.clear(); - foreach (SpecificEntity* entity, m_entities) + Q_FOREACH (SpecificEntity* entity, m_entities) { for (quint32 i = entity->position(); i < entity->position()+entity->length(); i++) { diff --git a/lima_annoqt/src/kcolorbutton.cpp b/lima_annoqt/src/kcolorbutton.cpp index c5eb896d2..f6d36a780 100644 --- a/lima_annoqt/src/kcolorbutton.cpp +++ b/lima_annoqt/src/kcolorbutton.cpp @@ -106,7 +106,7 @@ void KColorButton::setColor( const QColor &c ) if ( d->col != c ) { d->col = c; repaint(); - emit changed( d->col ); + Q_EMIT changed( d->col ); } } diff --git a/lima_annoqt/src/kcolorcollection.cpp b/lima_annoqt/src/kcolorcollection.cpp index ac9801f63..d690fdd10 100644 --- a/lima_annoqt/src/kcolorcollection.cpp +++ b/lima_annoqt/src/kcolorcollection.cpp @@ -152,7 +152,7 @@ KColorCollection::save() str << "KDE RGB Palette\n"; str << description << "\n"; - foreach (const KColorCollectionPrivate::ColorNode &node, d->colorList) + Q_FOREACH (const 
KColorCollectionPrivate::ColorNode &node, d->colorList) { int r,g,b; node.color.getRgb(&r, &g, &b); diff --git a/lima_annoqt/src/kcolorcombo.cpp b/lima_annoqt/src/kcolorcombo.cpp index 52709f990..0b7875c07 100644 --- a/lima_annoqt/src/kcolorcombo.cpp +++ b/lima_annoqt/src/kcolorcombo.cpp @@ -334,7 +334,7 @@ void KColorComboPrivate::_k_slotActivated(int index) internalcolor = colorList[index - 1]; } - emit q->activated(internalcolor); + Q_EMIT q->activated(internalcolor); } void KColorComboPrivate::_k_slotHighlighted(int index) @@ -347,7 +347,7 @@ void KColorComboPrivate::_k_slotHighlighted(int index) internalcolor = colorList[index - 1]; } - emit q->highlighted(internalcolor); + Q_EMIT q->highlighted(internalcolor); } void KColorComboPrivate::addColors() diff --git a/lima_annoqt/src/kcolordialog.cpp b/lima_annoqt/src/kcolordialog.cpp index c31781314..215ee6c98 100644 --- a/lima_annoqt/src/kcolordialog.cpp +++ b/lima_annoqt/src/kcolordialog.cpp @@ -383,7 +383,7 @@ void KColorCells::mouseReleaseEvent(QMouseEvent *e) d->inMouse = false; if (cell != -1) - emit colorSelected(cell , color(cell)); + Q_EMIT colorSelected(cell , color(cell)); } QTableWidget::mouseReleaseEvent(e); @@ -394,7 +394,7 @@ void KColorCells::mouseDoubleClickEvent(QMouseEvent * /*e*/) int cell = positionToCell(d->mousePos); if (cell != -1) - emit colorDoubleClicked(cell , color(cell)); + Q_EMIT colorDoubleClicked(cell , color(cell)); } @@ -455,7 +455,7 @@ void KColorPatch::dropEvent(QDropEvent *event) QColor c = KColorMimeData::fromMimeData(event->mimeData()); if (c.isValid()) { setColor(c); - emit colorChanged(c); + Q_EMIT colorChanged(c); } } @@ -625,12 +625,12 @@ KColorTable::KColorTablePrivate::slotShowNamedColorReadError(void) // // 2000-02-12 Espen Sand -// Set the color in two steps. The setColors() slot will not emit a signal +// Set the color in two steps. The setColors() slot will not Q_EMIT a signal // with the current color setting. 
The reason is that setColors() is used // by the color selector dialog on startup. In the color selector dialog // we normally want to display a startup color which we specify // when the dialog is started. The slotSetColors() slot below will -// set the palette and then use the information to emit a signal with the +// set the palette and then use the information to Q_EMIT a signal with the // new color setting. It is only used by the combobox widget. // void @@ -729,7 +729,7 @@ KColorTable::KColorTablePrivate::slotColorCellSelected(int index , const QColor& { if (!mPalette || (index >= mPalette->count())) return; - emit q->colorSelected(mPalette->color(index), mPalette->name(index)); + Q_EMIT q->colorSelected(mPalette->color(index), mPalette->name(index)); } void @@ -737,14 +737,14 @@ KColorTable::KColorTablePrivate::slotColorCellDoubleClicked(int index , const QC { if (!mPalette || (index >= mPalette->count())) return; - emit q->colorDoubleClicked(mPalette->color(index), mPalette->name(index)); + Q_EMIT q->colorDoubleClicked(mPalette->color(index), mPalette->name(index)); } void KColorTable::KColorTablePrivate::slotColorTextSelected(const QString &colorText) { - emit q->colorSelected(m_namedColorMap[ colorText ], colorText); + Q_EMIT q->colorSelected(m_namedColorMap[ colorText ], colorText); } @@ -1175,7 +1175,7 @@ void KColorDialog::KColorDialogPrivate::slotDefaultColorClicked() } else { showColor(selColor, QString()); } - emit q->colorSelected(selColor); + Q_EMIT q->colorSelected(selColor); } void @@ -1465,10 +1465,10 @@ void KColorDialog::KColorDialogPrivate::_setColor(const QColor &color, const QSt showColor(selColor, name); - emit q->colorSelected(selColor); + Q_EMIT q->colorSelected(selColor); } -// show but don't set into selColor, nor emit colorSelected +// show but don't set into selColor, nor Q_EMIT colorSelected void KColorDialog::KColorDialogPrivate::showColor(const QColor &color, const QString &name) { bRecursion = true; diff --git 
a/lima_annoqt/src/kcolordialog.h b/lima_annoqt/src/kcolordialog.h index d3988cef9..297cc0a0b 100644 --- a/lima_annoqt/src/kcolordialog.h +++ b/lima_annoqt/src/kcolordialog.h @@ -226,8 +226,8 @@ class KColorSpinBox : public QSpinBox virtual void valueChange() { updateDisplay(); - emit valueChanged( value() ); - emit valueChanged( currentValueText() ); + Q_EMIT valueChanged( value() ); + Q_EMIT valueChanged( currentValueText() ); }*/ }; diff --git a/lima_annoqt/src/kcolorvalueselector.cpp b/lima_annoqt/src/kcolorvalueselector.cpp index 8a38c569c..ff525151a 100644 --- a/lima_annoqt/src/kcolorvalueselector.cpp +++ b/lima_annoqt/src/kcolorvalueselector.cpp @@ -108,7 +108,7 @@ void KColorValueSelector::setChooserMode( KColorChooserMode c ) d->_mode = c; //really needed? - //emit modeChanged(); + //Q_EMIT modeChanged(); } KColorChooserMode KColorValueSelector::chooserMode () const diff --git a/lima_annoqt/src/kxyselector.cpp b/lima_annoqt/src/kxyselector.cpp index 92cf6a128..d372b1bad 100644 --- a/lima_annoqt/src/kxyselector.cpp +++ b/lima_annoqt/src/kxyselector.cpp @@ -188,7 +188,7 @@ void KXYSelector::mouseMoveEvent( QMouseEvent *e ) valuesFromPosition( e->pos().x() - w, e->pos().y() - w, xVal, yVal ); setValues( xVal, yVal ); - emit valueChanged( d->xPos, d->yPos ); + Q_EMIT valueChanged( d->xPos, d->yPos ); } void KXYSelector::wheelEvent( QWheelEvent *e ) @@ -198,7 +198,7 @@ void KXYSelector::wheelEvent( QWheelEvent *e ) else setValues( xValue(), yValue() + e->delta()/120 ); - emit valueChanged( d->xPos, d->yPos ); + Q_EMIT valueChanged( d->xPos, d->yPos ); } void KXYSelector::valuesFromPosition( int x, int y, int &xVal, int &yVal ) const diff --git a/lima_annoqt/src/specificEntity.cpp b/lima_annoqt/src/specificEntity.cpp index 4d624b77f..f41e02358 100644 --- a/lima_annoqt/src/specificEntity.cpp +++ b/lima_annoqt/src/specificEntity.cpp @@ -55,6 +55,6 @@ SpecificEntity& SpecificEntity::operator=(const SpecificEntity& se) void SpecificEntity::slotTriggered() { qDebug() 
<< "SpecificEntity::slotTriggered"; - emit triggered( this ); + Q_EMIT triggered( this ); } diff --git a/lima_antinno/src/antinno.ResourcesIdent.h b/lima_antinno/src/antinno.ResourcesIdent.h new file mode 100644 index 000000000..39f9d0c93 --- /dev/null +++ b/lima_antinno/src/antinno.ResourcesIdent.h @@ -0,0 +1,101 @@ + +#ifndef ghdghscjenicfhermfuchhmfmaixfxdsqksdogqùjefqojxefoejkg +#define ghdghscjenicfhermfuchhmfmaixfxdsqksdogqùjefqojxefoejkg + +/* +Code copié de AntResourcesIdent.h + +Sans doute à améliorer + +FW 30/10/2013 + + +*/ +// +// C++ Implementation : AntResourcesIdent +// +// Description: analyse les identifiants d'une ressource binaire Ant'inno +// +// Author: Jean-Yves Sage , (C) 2010-2011 +// +// Copyright: See COPYING file that comes with this distribution +// +//////////////////////////////////////////////////////////// +#include +#include +#include + +namespace antinno { + +//brief This class extracts identifiers from binary resources files + +class ResourcesIdent +{ +public: + //------------------------------------------------------------------------------------------------------------------------------ + //param header header in memory + //param headerSize header size (for check) + ResourcesIdent(const char *header, const ::std::size_t headerSize) + : _pHeader(header), _pHeaderSize(headerSize) + { + } + //------------------------------------------------------------------------------------------------------------------------------ + //return string ready to display */ + ::std::string toHumanReadableString() + { + unsigned char *currentPtr = (unsigned char*)_pHeader; + //UNSIGNED indispensable pour calculer les valeurs des entiers + ::std::ostringstream resultoss; + //lit les noms d'identifiants + const ::std::size_t namesSize = _readInt4LE(currentPtr); + const ::std::string names = ::std::string((char*)currentPtr, namesSize); + currentPtr += namesSize; + //lit les valeurs + + const ::std::size_t valuesNb = _readInt4LE(currentPtr) / 4; + 
::std::size_t ptrb = 0; + for (::std::size_t i=0; i + + + + diff --git a/lima_common/src/common/AbstractFactoryPattern/AbstractFactoryPatternExport.h b/lima_common/src/common/AbstractFactoryPattern/AbstractFactoryPatternExport.h index 7a157fa0d..c15ba32b7 100644 --- a/lima_common/src/common/AbstractFactoryPattern/AbstractFactoryPatternExport.h +++ b/lima_common/src/common/AbstractFactoryPattern/AbstractFactoryPatternExport.h @@ -32,7 +32,7 @@ #define LIMA_ABSTRACTFACTORYPATTERNEXPORT_H -#include +#include #ifdef WIN32 diff --git a/lima_common/src/common/AbstractFactoryPattern/AmosePluginsManager.cpp b/lima_common/src/common/AbstractFactoryPattern/AmosePluginsManager.cpp index 93b773ab7..64ce209d1 100644 --- a/lima_common/src/common/AbstractFactoryPattern/AmosePluginsManager.cpp +++ b/lima_common/src/common/AbstractFactoryPattern/AmosePluginsManager.cpp @@ -19,6 +19,7 @@ #include "AmosePluginsManager.h" #include "common/LimaCommon.h" #include "common/AbstractFactoryPattern/DynamicLibrariesManager.h" +#include "common/tools/FileUtils.h" #include #include @@ -26,49 +27,68 @@ using namespace Lima; using namespace Lima::Common; +using namespace Lima::Common::Misc; AmosePluginsManager::AmosePluginsManager() { loadPlugins(); } -bool AmosePluginsManager::loadPlugins() +bool AmosePluginsManager::loadPlugins(const QString& configDirs) { - ABSTRACTFACTORYPATTERNLOGINIT; - LINFO << "AmosePluginsManager::loadPlugins"; +// ABSTRACTFACTORYPATTERNLOGINIT; +// LINFO << "AmosePluginsManager::loadPlugins"; // DynamicLibrariesManager::changeable().addSearchPath("c:\amose\lib");; // open LIMA_CONF/plugins file - QDir pluginsDir(QString::fromUtf8(qgetenv("LIMA_CONF").constData()==0?"":qgetenv("LIMA_CONF").constData()) + "/plugins"); - QStringList pluginsFiles = pluginsDir.entryList(QDir::Files); - Q_FOREACH(QString pluginsFile, pluginsFiles) + + QStringList configDirsList = configDirs.split(LIMA_PATH_SEPARATOR); + if (configDirsList.isEmpty()) { -#ifdef DEBUG_CD - LDEBUG << 
"AmosePluginsManager::loadPlugins loding plugins file " << pluginsFile.toUtf8().data(); + // Look for LIMA_CONF directory. + configDirsList = buildConfigurationDirectoriesList(QStringList() << "lima", QStringList()); + } +#ifdef ANTINNO_SPECIFIC + Q_FOREACH(const QString& configDir, configDirsList) +#else + for(const QString& configDir : configDirsList) #endif - QFile file(pluginsDir.path() + "/" + pluginsFile); - if (!file.open(QIODevice::ReadOnly)) - return false; - // for each entry, call load library - while (!file.atEnd()) + { + // Deduce plugins directory. + QString stdPluginsDir(configDir); + stdPluginsDir.append("/plugins"); + QDir pluginsDir(stdPluginsDir); + + // For each file under plugins directory, read plugins names and deduce shared libraries to load. + QStringList pluginsFiles = pluginsDir.entryList(QDir::Files); + Q_FOREACH(QString pluginsFile, pluginsFiles) { - QByteArray line = file.readLine(); - if (line.endsWith('\n')) line.chop(1); - // Allows empty and comment lines - if ( !line.isEmpty() && !line.startsWith('#') ) +// #ifdef DEBUG_CD +// LDEBUG << "AmosePluginsManager::loadPlugins loading plugins file " << pluginsFile.toUtf8().data(); +// #endif + // Open plugin file. 
+ QFile file(pluginsDir.path() + "/" + pluginsFile); + if (!file.open(QIODevice::ReadOnly)) { + ABSTRACTFACTORYPATTERNLOGINIT; + LERROR << "AmosePluginsManager::loadPlugins: cannot open plugins file " << pluginsFile.toUtf8().data(); + return false; + } + + // For each entry, call load library + while (!file.atEnd()) { -#ifdef WIN32 - QString strline = QString(line.data()).trimmed() + ".dll"; - QString library_path=QString::fromUtf8(qgetenv("LD_LIBRARY_PATH").constData()==0?"c:\amose\lib":qgetenv("LD_LIBRARY_PATH").constData()); - DynamicLibrariesManager::changeable().addSearchPathes( library_path.toUtf8().data()); -#else - QString strline = QString("lib") + line.data() + ".so"; -#endif -#ifdef DEBUG_CD - LDEBUG << "AmosePluginsManager::loadPlugins loading plugin '" << line.data() << "'"; -#endif - DynamicLibrariesManager::changeable().loadLibrary(line.data()); + // Remove whitespace characters from the start and the end. + QString line = QString(file.readLine()).trimmed(); + + // Allow empty and comment lines. 
+ if ( !line.isEmpty() && !line.startsWith('#') ) + { +// #ifdef DEBUG_CD +// LDEBUG << "AmosePluginsManager::loadPlugins loading plugin '" << line.toStdString().c_str() << "'"; +// #endif + DynamicLibrariesManager::changeable().loadLibrary(line.toStdString().c_str()); + } } } } return true; -} +} \ No newline at end of file diff --git a/lima_common/src/common/AbstractFactoryPattern/AmosePluginsManager.h b/lima_common/src/common/AbstractFactoryPattern/AmosePluginsManager.h index fa43178d2..ca2bf6725 100644 --- a/lima_common/src/common/AbstractFactoryPattern/AmosePluginsManager.h +++ b/lima_common/src/common/AbstractFactoryPattern/AmosePluginsManager.h @@ -22,6 +22,8 @@ #include "common/AbstractFactoryPattern/AbstractFactoryPatternExport.h" #include "common/AbstractFactoryPattern/Singleton.h" +#include + namespace Lima { @@ -29,12 +31,17 @@ class LIMA_FACTORY_EXPORT AmosePluginsManager : public Singleton { friend class Singleton; +public: + virtual ~AmosePluginsManager() {} + + /** Load plugins in the plugins subdir of the semicolon separated config dirs + * @param configDirs semicolon separated list of config dirs. 
If empty, loads a default location + */ + bool loadPlugins(const QString& configDirs = ""); private: AmosePluginsManager(); - virtual ~AmosePluginsManager() {} - bool loadPlugins(); }; } diff --git a/lima_common/src/common/AbstractFactoryPattern/DynamicLibrariesManager.cpp b/lima_common/src/common/AbstractFactoryPattern/DynamicLibrariesManager.cpp index 95bfbdbcd..aa732038b 100644 --- a/lima_common/src/common/AbstractFactoryPattern/DynamicLibrariesManager.cpp +++ b/lima_common/src/common/AbstractFactoryPattern/DynamicLibrariesManager.cpp @@ -31,63 +31,78 @@ #include #include #include -#include +#ifdef ANTINNO_SPECIFIC +// FWI 17/08/2015 : désactivé car n'existe pas dans QT4 +#else +#include +#endif using namespace std; namespace Lima { namespace Common { -DynamicLibrariesManager::DynamicLibrariesManager() +class DynamicLibrariesManagerPrivate +{ +friend class DynamicLibrariesManager; + DynamicLibrariesManagerPrivate(); + + std::map > m_handles; + // at load time, will try to load the libraries from these paths before the default ones + std::vector m_supplementarySearchPath; +}; + +DynamicLibrariesManagerPrivate::DynamicLibrariesManagerPrivate() : + m_handles(), + m_supplementarySearchPath() +{ +} + + +DynamicLibrariesManager::DynamicLibrariesManager() : m_d(new DynamicLibrariesManagerPrivate()) { } DynamicLibrariesManager::~DynamicLibrariesManager() { - for (std::map::iterator - it=m_handles.begin(),it_end=m_handles.end(); it!=it_end; it++) - { - delete (*it).second; - } } bool DynamicLibrariesManager:: isLoaded(const std::string& libName) { - std::map::const_iterator - it=m_handles.find(libName); - return (it!=m_handles.end()); + auto it=m_d->m_handles.find(libName); + return (it!=m_d->m_handles.end()); } -bool DynamicLibrariesManager:: -loadLibrary(const std::string& libName) +bool DynamicLibrariesManager::loadLibrary(const std::string& libName) { #ifdef DEBUG_CD ABSTRACTFACTORYPATTERNLOGINIT; LDEBUG <<"DynamicLibrariesManager::loadLibrary() -- 
"<<"libName="<::const_iterator - it=m_handles.find(libName); - if (it!=m_handles.end()) { + auto it=m_d->m_handles.find(libName); + if (it!=m_d->m_handles.end()) { #ifdef DEBUG_CD - LWARN << "DEBUG_CD: trying to reload dynamic library " << libName.c_str(); + LDEBUG << "DynamicLibrariesManager::loadLibrary trying to reload dynamic library" << libName.c_str(); + return false; #endif } - QLibrary* libhandle = 0; + std::shared_ptr< QLibrary > libhandle; // try supplementary search path - for (std::vector::const_iterator it = m_supplementarySearchPath.begin(); it != m_supplementarySearchPath.end(); it++) + for (auto it = m_d->m_supplementarySearchPath.begin(); it != m_d->m_supplementarySearchPath.end(); it++) { #ifdef DEBUG_FACTORIES LDEBUG << "Trying supplementary " << ((*it)+"/"+libName).c_str(); #endif - libhandle = new QLibrary( ((*it)+"/"+libName).c_str() ); + libhandle = std::shared_ptr< QLibrary >(new QLibrary( ((*it)+"/"+libName).c_str() )); libhandle->setLoadHints(QLibrary::ResolveAllSymbolsHint | QLibrary::ExportExternalSymbolsHint); if (libhandle->load()) { - m_handles.insert(std::make_pair(libName,libhandle)); + m_d->m_handles.insert(std::make_pair(libName,libhandle)); #ifdef DEBUG_CD - LDEBUG << "the library " << libName.c_str() << " was loaded"; + LDEBUG << "the library " << libName.c_str() << " was loaded from supplementary search path"; + LDEBUG << "the library fully-qualified name: " << libhandle->fileName(); #endif return true; } @@ -96,8 +111,6 @@ loadLibrary(const std::string& libName) // if ( QLibrary::isLibrary(((*it)+"/"+libName).c_str()) ) ABSTRACTFACTORYPATTERNLOGINIT; LERROR <<"DynamicLibrariesManager::loadLibrary() -- "<<"Failed to open lib " << libhandle->errorString().toUtf8().data(); - delete libhandle; - libhandle = 0; } } // now try system default search path @@ -106,13 +119,14 @@ loadLibrary(const std::string& libName) #ifdef DEBUG_FACTORIES LINFO << "Trying " << libName.c_str(); #endif - libhandle = new QLibrary( libName.c_str() ); + 
libhandle = std::shared_ptr( new QLibrary( libName.c_str() ) ); libhandle->setLoadHints(QLibrary::ResolveAllSymbolsHint | QLibrary::ExportExternalSymbolsHint); if (libhandle->load()) { - m_handles.insert(std::make_pair(libName,libhandle)); + m_d->m_handles.insert(std::make_pair(libName,libhandle)); #ifdef DEBUG_CD - LDEBUG << "the library " << libName.c_str() << " was loaded"; + LDEBUG << "the library " << libName.c_str() << " was loaded from system default search path"; + LDEBUG << "the library fully-qualified name: " << libhandle->fileName(); #endif return true; } @@ -120,13 +134,11 @@ loadLibrary(const std::string& libName) { ABSTRACTFACTORYPATTERNLOGINIT; LINFO <<"DynamicLibrariesManager::loadLibrary() -- "<< "Failed to open lib " << libhandle->errorString().toUtf8().data(); - delete libhandle; - libhandle = 0; return false; } } else { - m_handles[libName]=libhandle; + m_d->m_handles[libName]=libhandle; #ifdef DEBUG_CD LDEBUG << "the library " << libName.c_str() << " was loaded"; #endif @@ -137,7 +149,7 @@ loadLibrary(const std::string& libName) void DynamicLibrariesManager:: addSearchPath(const std::string& searchPath) { - if(std::find(m_supplementarySearchPath.begin(), m_supplementarySearchPath.end(), searchPath)!=m_supplementarySearchPath.end()){ + if(std::find(m_d->m_supplementarySearchPath.begin(), m_d->m_supplementarySearchPath.end(), searchPath)!=m_d->m_supplementarySearchPath.end()){ return; } #ifdef DEBUG_CD @@ -145,7 +157,7 @@ addSearchPath(const std::string& searchPath) LINFO << "adding search path '"<m_supplementarySearchPath.push_back(searchPath); } @@ -155,7 +167,12 @@ addSearchPathes(QString searchPathes) #ifdef DEBUG_CD ABSTRACTFACTORYPATTERNLOGINIT; #endif +#ifdef ANTINNO_SPECIFIC + // FWI 17/08/2015 : ligne modifiée car QRegularExpression n'existe pas dans QT4 + QStringList list = searchPathes.replace("\\","/").split(";", QString::SkipEmptyParts); +#else QStringList list = searchPathes.replace("\\","/").split(QRegularExpression("[;]"), 
QString::SkipEmptyParts); +#endif for(QStringList::iterator it = list.begin(); it!=list.end();++it) { QString searchPath = *it; diff --git a/lima_common/src/common/AbstractFactoryPattern/DynamicLibrariesManager.h b/lima_common/src/common/AbstractFactoryPattern/DynamicLibrariesManager.h index a3be72de3..fcb9af768 100644 --- a/lima_common/src/common/AbstractFactoryPattern/DynamicLibrariesManager.h +++ b/lima_common/src/common/AbstractFactoryPattern/DynamicLibrariesManager.h @@ -41,16 +41,19 @@ #include #include #include +#include class QString; namespace Lima { namespace Common { +class DynamicLibrariesManagerPrivate; class LIMA_FACTORY_EXPORT DynamicLibrariesManager: public Singleton { friend class Singleton; - public: + +public: ~DynamicLibrariesManager(); bool isLoaded(const std::string& libName); @@ -60,10 +63,8 @@ friend class Singleton; private: DynamicLibrariesManager(); - - std::map m_handles; - // at load time, will try to load the libraries from these paths before the default ones - std::vector m_supplementarySearchPath; + + std::unique_ptr m_d; }; } // end namespace diff --git a/lima_common/src/common/AbstractFactoryPattern/InitializableObject.h b/lima_common/src/common/AbstractFactoryPattern/InitializableObject.h index dade5ab30..8f3060765 100644 --- a/lima_common/src/common/AbstractFactoryPattern/InitializableObject.h +++ b/lima_common/src/common/AbstractFactoryPattern/InitializableObject.h @@ -43,6 +43,10 @@ template class InitializableObject { public: + InitializableObject() : m_id() {} + InitializableObject(const InitializableObject& object) { m_id = object.m_id; } + InitializableObject& operator=(const InitializableObject& object) { m_id = object.m_id; return *this; } + /** * Manager is the type of the Manager associated to the initializableObject. 
* This type is an instanciation of InitializableObjectManager template with diff --git a/lima_common/src/common/AbstractFactoryPattern/ProcessingClientFactory.h b/lima_common/src/common/AbstractFactoryPattern/ProcessingClientFactory.h index dfde4b494..8e2fe3957 100644 --- a/lima_common/src/common/AbstractFactoryPattern/ProcessingClientFactory.h +++ b/lima_common/src/common/AbstractFactoryPattern/ProcessingClientFactory.h @@ -66,7 +66,7 @@ class ProcessingClientFactory * ClientFactory must have been configured before this method is called * Use configureClientFactory() method to configure. */ - virtual AbstractProcessingClient* createClient(const std::string& id) const = 0; + virtual std::shared_ptr< AbstractProcessingClient > createClient(const std::string& id) const = 0; /** * @brief show registered clientId @@ -89,7 +89,7 @@ class ProcessingClientFactoryFactory: public Singleton createProcessingClientFactory(const std::string& id) const ; private: ProcessingClientFactoryFactory() {}; }; @@ -97,8 +97,8 @@ class ProcessingClientFactoryFactory: public Singleton { public: - virtual ~AbstractProcessingClientFactoryFactory() {std::cerr << "~AbstractExtractorFactory()" << std::endl;}; - virtual ProcessingClientFactory* createProcessingClientFactory() const = 0; + virtual ~AbstractProcessingClientFactoryFactory() {}; + virtual std::shared_ptr< ProcessingClientFactory > createProcessingClientFactory() const = 0; protected: AbstractProcessingClientFactoryFactory(const std::string& id): RegistrableFactory(id) {}; diff --git a/lima_common/src/common/AbstractFactoryPattern/Singleton.h b/lima_common/src/common/AbstractFactoryPattern/Singleton.h index 009e2c577..77358757a 100644 --- a/lima_common/src/common/AbstractFactoryPattern/Singleton.h +++ b/lima_common/src/common/AbstractFactoryPattern/Singleton.h @@ -1,5 +1,5 @@ /* - Copyright 2002-2013 CEA LIST + Copyright 2002-2016 CEA LIST This file is part of LIMA. 
@@ -16,13 +16,10 @@ You should have received a copy of the GNU Affero General Public License along with LIMA. If not, see */ -/*************************************************************************** - * Copyright (C) 2004-2012 by CEA LIST * - * * - ***************************************************************************/ #ifndef LIMA_MISC_SINGLETON_H #define LIMA_MISC_SINGLETON_H +#include namespace Lima { @@ -55,20 +52,20 @@ class Singleton private: - static Object* s_instance; + static std::unique_ptr< Object > s_instance; Singleton(const Singleton&) {} }; template -Object* Singleton::s_instance(0); +std::unique_ptr< Object > Singleton::s_instance(new Object()); template const Object& Singleton::single() { if (s_instance==0) { - s_instance=new Object(); + s_instance=std::unique_ptr< Object >(new Object()); } return *s_instance; } @@ -78,9 +75,9 @@ const Object* Singleton::psingle() { if (s_instance==0) { - s_instance=new Object(); + s_instance=std::unique_ptr< Object >(new Object()); } - return s_instance; + return s_instance.get(); } template @@ -88,7 +85,7 @@ Object& Singleton::changeable() { if (s_instance==0) { - s_instance=new Object(); + s_instance=std::unique_ptr< Object >(new Object()); } return *s_instance; } @@ -98,9 +95,9 @@ Object* Singleton::pchangeable() { if (s_instance==0) { - s_instance=new Object(); + s_instance=std::unique_ptr< Object >(new Object()); } - return s_instance; + return s_instance.get(); } } // Lima diff --git a/lima_common/src/common/AbstractFactoryPattern/antinno.LibraryLoader.class.cpp b/lima_common/src/common/AbstractFactoryPattern/antinno.LibraryLoader.class.cpp new file mode 100644 index 000000000..a4d24875c --- /dev/null +++ b/lima_common/src/common/AbstractFactoryPattern/antinno.LibraryLoader.class.cpp @@ -0,0 +1,97 @@ + +#include "antinno.LibraryLoader.class.h" +#include "common/LimaCommon.h" +#include +#include +#include +#ifdef WIN32 + #define WIN32_LEAN_AND_MEAN + #include + #ifdef ERROR + //#undef ERROR + #endif 
+#endif + +namespace Lima { namespace antinno { + +#ifdef WIN32 +class SystemMsg +{ +public: + SystemMsg(DWORD msgId) : _msgId(msgId), _lpMsgBuf(NULL) + { + DWORD msgBufLen = ::FormatMessage( + FORMAT_MESSAGE_ALLOCATE_BUFFER | // max 64K bytes + FORMAT_MESSAGE_FROM_SYSTEM | + FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, // lpSource (optional) + _msgId, + MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), + (LPTSTR) &_lpMsgBuf, + 0, NULL ); + if (msgBufLen != 0) + _s.assign(static_cast<::std::wstring::value_type const*>(_lpMsgBuf)); + else // traitement par défaut mais perfectible + _s.clear(); + } + ::std::string toUtf8String() const + { + return ::boost::locale::conv::utf_to_utf(_s); + } + ~SystemMsg() + { + LocalFree(_lpMsgBuf); + } +private: + LPVOID _lpMsgBuf; + DWORD _msgId; + ::std::wstring _s; +}; +#else + #error no implementation for non-win32 systems +#endif + +LibraryLoader::LibraryLoader() +{ +} +void LibraryLoader::loadFromFile(::std::string const& filePath) +{ + ABSTRACTFACTORYPATTERNLOGINIT + ::std::ifstream in(filePath); + if (!in) + throw ::std::exception((::std::string("Cannot open file (read mode): ") + filePath).data()); + ::std::string line; + while (::std::getline(in, line)) + { + ::boost::algorithm::trim(line); + if (line.size() > 1 && line[0] != '#') // skip comment lines beginning with "#" + { +#ifdef WIN32 + ::std::string const path = line + ".dll"; +#else +#error no implementation for non-win32 systems +#endif +#ifdef WIN32 + if (NULL != /*win32*/::LoadLibrary(path.c_str())) +#else +#error no implementation for non-win32 systems +#endif + { + ::std::cout << L"Plugin successfully loaded: " << path << ::std::endl; + LDEBUG << "Plugin successfully loaded: " << path; + } + else + { +#ifdef WIN32 + auto const msgId = ::GetLastError(); +#else +#error no implementation for non-win32 systems +#endif + ::std::cout << L"Plugin loading failed: " << line << " : (err windows " << msgId << ") " << SystemMsg(msgId).toUtf8String() << ::std::endl; + LDEBUG << L"Plugin 
loading failed: " << line << " : (err windows " << msgId << ") " << SystemMsg(msgId).toUtf8String(); + } + } + } +} + +}} diff --git a/lima_common/src/common/AbstractFactoryPattern/antinno.LibraryLoader.class.h b/lima_common/src/common/AbstractFactoryPattern/antinno.LibraryLoader.class.h new file mode 100644 index 000000000..e9e4f76f9 --- /dev/null +++ b/lima_common/src/common/AbstractFactoryPattern/antinno.LibraryLoader.class.h @@ -0,0 +1,15 @@ + +#pragma once + +#include "common/AbstractFactoryPattern/AbstractFactoryPatternExport.h" + +namespace Lima { namespace antinno { + +class LIMA_FACTORY_EXPORT LibraryLoader +{ +public: + LibraryLoader::LibraryLoader(); + void loadFromFile(::std::string const& filePath); +}; + +}} \ No newline at end of file diff --git a/lima_common/src/common/AbstractProcessingClient/AbstractProcessingClient.h b/lima_common/src/common/AbstractProcessingClient/AbstractProcessingClient.h index bee4cfccb..36cfd839b 100644 --- a/lima_common/src/common/AbstractProcessingClient/AbstractProcessingClient.h +++ b/lima_common/src/common/AbstractProcessingClient/AbstractProcessingClient.h @@ -32,7 +32,12 @@ namespace Lima class AbstractProcessingClient { public: - +#ifdef ANTINNO_SPECIFIC + // FWI 13/03/2015 : jout 3 méthodes sinon erreur de link + AbstractProcessingClient() {} + AbstractProcessingClient(AbstractProcessingClient const&) {} + AbstractProcessingClient& operator=(AbstractProcessingClient const&) { return *this; } +#endif //! 
@brief Define the destructor virtual to ensure concrete client destructors to be called virtual ~AbstractProcessingClient() {} @@ -47,7 +52,12 @@ class AbstractProcessingClient const std::map& metaData, const std::string& pipeline, const std::map& handlers, +#ifdef ANTINNO_SPECIFIC + const std::set& inactiveUnits = std::set(), + Lima::StopAnalyze const& stopAnalyze = Lima::defaultStopAnalyze) const = 0; +#else const std::set& inactiveUnits = std::set()) const = 0; +#endif }; @@ -91,7 +101,7 @@ class AbstractProcessingClientFactory /** * This function create a LinguisticProcessing client */ - virtual AbstractProcessingClient* createClient() const = 0; + virtual std::shared_ptr< AbstractProcessingClient > createClient() const = 0; /** * virtual destructor of the LinguisticProcessing client factory diff --git a/lima_common/src/common/Data/DataTypes.cpp b/lima_common/src/common/Data/DataTypes.cpp index afb7a1bea..417055c72 100644 --- a/lima_common/src/common/Data/DataTypes.cpp +++ b/lima_common/src/common/Data/DataTypes.cpp @@ -409,7 +409,7 @@ std::ostream& operator<<(ostream& os, const Node& node) QDebug& operator<<(QDebug& os, const Node& node) { - os<<"Node "<* nodes=structure.getNodes(); - os << "Structure: " << structure.getStructId() << " ; nodes ("<size()<<"): "; + os << "Structure( structId:" << structure.getStructId() << ", nodes ("<size()<<"): "; for (map::const_iterator ItrNodes = nodes->begin(); ItrNodes != nodes->end() ; ItrNodes++) { - os<<"node ("<first<<"):" << ItrNodes->second; + os<<"node ("<first<<":" << ItrNodes->second<<")"; } return os; } diff --git a/lima_common/src/common/Data/LimaString.cpp b/lima_common/src/common/Data/LimaString.cpp index a786126ab..89b2d1720 100644 --- a/lima_common/src/common/Data/LimaString.cpp +++ b/lima_common/src/common/Data/LimaString.cpp @@ -19,23 +19,27 @@ /** * @file LimaString.cpp * @date Created on : Thu Oct 9, 2003 - * @author Gael de Chalendar - + * @author Gael de Chalendar \n * Copyright (c) 2003-2012 by CEA 
LIST * @version $Id$ */ #include "LimaString.h" - namespace Lima { - +#ifdef ANTINNO_SPECIFIC +// FWI 19/05/2016 : supprimé car défini dans LimaCommon.h +// std::ostream& operator<<(std::ostream &os, const LimaString& s) +// { +// os << s.toUtf8().data(); +// return os; +// } +#else std::ostream& operator<<(std::ostream &os, const LimaString& s) { os << s.toUtf8().data(); return os; } - - +#endif } // closing namespace Lima diff --git a/lima_common/src/common/Data/LimaString.h b/lima_common/src/common/Data/LimaString.h index 7280d6f36..d56cec97b 100644 --- a/lima_common/src/common/Data/LimaString.h +++ b/lima_common/src/common/Data/LimaString.h @@ -1,4 +1,4 @@ -/* +/* Copyright 2002-2013 CEA LIST This file is part of LIMA. @@ -37,7 +37,12 @@ namespace Lima typedef QChar LimaChar; typedef QString LimaString; -LIMA_DATA_EXPORT std::ostream& operator<<(std::ostream &os, const LimaString& s); +#ifdef ANTINNO_SPECIFIC + // FWI 19/05/2016 : supprimé car défini dans LimaCommon.h + //LIMA_DATA_EXPORT std::ostream& operator<<(std::ostream &os, const LimaString& s); +#else + LIMA_DATA_EXPORT std::ostream& operator<<(std::ostream &os, const LimaString& s); +#endif } // closing namespace Lima diff --git a/lima_common/src/common/Data/genericDocumentProperties.cpp b/lima_common/src/common/Data/genericDocumentProperties.cpp index 069b695c2..d1e4a75bf 100644 --- a/lima_common/src/common/Data/genericDocumentProperties.cpp +++ b/lima_common/src/common/Data/genericDocumentProperties.cpp @@ -334,47 +334,62 @@ void GenericDocumentProperties::read(std::istream& file) { m_d->m_multipleStringValues.clear(); m_d->m_multipleWeightedPropValues.clear(); -// BOWLOGINIT; - +#ifdef DEBUG_CD + BOWLOGINIT; +#endif // read integer properties file.read((char*) &size, sizeof(uint32_t)); -// LDEBUG << "read size " << size; +#ifdef DEBUG_CD + LDEBUG << "read size " << size; +#endif for (uint32_t i(0); im_intValues.insert(std::pair(name,val)); } // read string properties file.read((char*) &size, 
sizeof(uint32_t)); -// LDEBUG << "read size " << size; +#ifdef DEBUG_CD + LDEBUG << "read size " << size; +#endif for (uint32_t i(0); im_stringValues.insert(std::pair(name,str) ); } // read date properties file.read((char*) &size, sizeof(uint32_t)); -// LDEBUG << "read size " << size; +#ifdef DEBUG_CD + LDEBUG << "read size " << size; +#endif for (uint32_t i(0); ireadDate(file); string strDate=d.toString().toUtf8().data(); -// LDEBUG << "read date " << strDate.c_str() << " as value of " << name.c_str(); +#ifdef DEBUG_CD + LDEBUG << "read date " << strDate.c_str() << " as value of " << name.c_str(); +#endif m_d->m_dateValues.insert(std::pair(name,d)); } // read date interval properties file.read((char*) &size, sizeof(uint32_t)); -// LDEBUG << "read size " << size; +#ifdef DEBUG_CD + LDEBUG << "read size " << size; +#endif for (uint32_t i(0); ireadDate(file); string strStartDate=startD.toString().toUtf8().data(); string strEndDate=endD.toString().toUtf8().data(); -// LDEBUG << "read interval [" << strStartDate.c_str() << "," << strEndDate.c_str() << " as value of " << name.c_str(); +#ifdef DEBUG_CD + LDEBUG << "read interval [" << strStartDate.c_str() << "," << strEndDate.c_str() << " as value of " << name.c_str(); +#endif std::pair interval(startD,endD); m_d->m_dateIntervalValues.insert(std::pair >(name,interval)); } // read multi-valued string properties file.read((char*) &size, sizeof(uint32_t)); -// LDEBUG << "read size " << size; +#ifdef DEBUG_CD + LDEBUG << "read size " << size; +#endif for (uint32_t i(0); im_multipleStringValues.insert(std::pair >(name,val) ); diff --git a/lima_common/src/common/Data/tests/FileUtilsTest.cpp b/lima_common/src/common/Data/tests/FileUtilsTest.cpp index a8c72bd0c..ec599c1a0 100644 --- a/lima_common/src/common/Data/tests/FileUtilsTest.cpp +++ b/lima_common/src/common/Data/tests/FileUtilsTest.cpp @@ -18,7 +18,7 @@ */ #include "FileUtilsTest.h" -#include "common/Data/FileUtils.h" +#include "common/tools/FileUtils.h" #include 
"common/QsLog/QsLogCategories.h" #include diff --git a/lima_common/src/common/FsaAccess/CompoundStringAccess.h b/lima_common/src/common/FsaAccess/CompoundStringAccess.h index 27c83a41c..5fd26b73c 100644 --- a/lima_common/src/common/FsaAccess/CompoundStringAccess.h +++ b/lima_common/src/common/FsaAccess/CompoundStringAccess.h @@ -114,10 +114,6 @@ CompoundStringAccess::CompoundStringAccess( bool trie_dire template CompoundStringAccess::~CompoundStringAccess() { -#ifdef DEBUG_CD - COMPSTRACCESSLOGINIT; - LDEBUG << "CompoundStringAccess::~CompoundStringAccess()"; -#endif } template diff --git a/lima_common/src/common/FsaAccess/FsaAccessIOHandler.h b/lima_common/src/common/FsaAccess/FsaAccessIOHandler.h index 676aa996f..1c52960be 100644 --- a/lima_common/src/common/FsaAccess/FsaAccessIOHandler.h +++ b/lima_common/src/common/FsaAccess/FsaAccessIOHandler.h @@ -26,7 +26,7 @@ #ifndef FSA_IO_HANDLER_HPP #define FSA_IO_HANDLER_HPP -#include +#include #include namespace Lima { diff --git a/lima_common/src/common/FsaAccess/FsaAccessSpare16.cpp b/lima_common/src/common/FsaAccess/FsaAccessSpare16.cpp index 09638eefa..802331c3d 100644 --- a/lima_common/src/common/FsaAccess/FsaAccessSpare16.cpp +++ b/lima_common/src/common/FsaAccess/FsaAccessSpare16.cpp @@ -56,10 +56,6 @@ FsaAccessSpare16::FsaAccessSpare16(bool trie_direction_fwd) FsaAccessSpare16::~FsaAccessSpare16() { -#ifdef DEBUG_CD - FSAALOGINIT; - LDEBUG << "FsaAccessSpare16::~FsaAccessSpare16()"; -#endif } FsaAccessIOHandler* diff --git a/lima_common/src/common/Handler/AbstractDocumentHandler.h b/lima_common/src/common/Handler/AbstractDocumentHandler.h index 6dca001e7..39a913dad 100644 --- a/lima_common/src/common/Handler/AbstractDocumentHandler.h +++ b/lima_common/src/common/Handler/AbstractDocumentHandler.h @@ -45,17 +45,17 @@ class AbstractDocumentHandler //! 
@brief destructor virtual ~AbstractDocumentHandler(){}; - virtual void writeDocumentsHeader(){}; - virtual void writeDocumentsFooter(){}; - - virtual void openSNode(const Lima::Common::Misc::GenericDocumentProperties* properties, - const std::string& elementName) = 0; - virtual void openSIndexingNode(const Lima::Common::Misc::GenericDocumentProperties* properties, - const std::string& elementName) = 0; - virtual void processProperties(const Lima::Common::Misc::GenericDocumentProperties* properties, bool useIterators) = 0; - virtual void closeSNode() = 0; - virtual void processSContent( const Lima::Common::Misc::GenericDocumentProperties* /*properties*/ ){}; - virtual void closeSContent(){}; + virtual void writeDocumentsHeader(){}; + virtual void writeDocumentsFooter(){}; + + virtual void openSNode(const Lima::Common::Misc::GenericDocumentProperties* properties, + const std::string& elementName) = 0; + virtual void openSIndexingNode(const Lima::Common::Misc::GenericDocumentProperties* properties, + const std::string& elementName) = 0; + virtual void processProperties(const Lima::Common::Misc::GenericDocumentProperties* properties, bool useIterator, bool useIndexIterator) = 0; + virtual void closeSNode() = 0; + virtual void processSContent( const Lima::Common::Misc::GenericDocumentProperties* /*properties*/ ){}; + virtual void closeSContent(){}; }; } // namespace Lima diff --git a/lima_common/src/common/Handler/AbstractProcessingClientHandler.h b/lima_common/src/common/Handler/AbstractProcessingClientHandler.h index bac840b08..2dfaa9ed5 100644 --- a/lima_common/src/common/Handler/AbstractProcessingClientHandler.h +++ b/lima_common/src/common/Handler/AbstractProcessingClientHandler.h @@ -31,7 +31,7 @@ class AbstractProcessingClientHandler public: virtual ~AbstractProcessingClientHandler() {} - inline virtual void setAnalysisClient(const std::string& clientId, AbstractProcessingClient* client) + inline virtual void setAnalysisClient(const std::string& clientId, 
std::shared_ptr< AbstractProcessingClient > client) { if (m_clients.find(clientId)!=m_clients.end()) { @@ -43,7 +43,7 @@ class AbstractProcessingClientHandler m_clients.insert(std::make_pair(clientId, client)); } - inline virtual AbstractProcessingClient* getAnalysisClient(const std::string& clientId) + inline virtual std::shared_ptr< AbstractProcessingClient > getAnalysisClient(const std::string& clientId) { if (m_clients.find(clientId)==m_clients.end()) { @@ -56,19 +56,28 @@ class AbstractProcessingClientHandler return m_clients[clientId]; } - inline virtual std::map getAnalysisClients() const {return m_clients;}; - inline virtual void setAnalysisClients(std::map clients){m_clients=clients;}; + inline virtual std::map > getAnalysisClients() const {return m_clients;}; + inline virtual void setAnalysisClients(std::map > clients){m_clients=clients;}; virtual void handleProc( const std::string& tagName, const std::string& content, const std::map& metaData, const std::string& pipeline, const std::map& handlers = std::map(), - const std::set& inactiveUnits = std::set()) + const std::set& inactiveUnits = std::set() +#ifdef ANTINNO_SPECIFIC + , Lima::StopAnalyze const& stopAnalyze = Lima::defaultStopAnalyze +#endif + ) { ABSTRACTPROCESSINGCLIENTLOGINIT; - LDEBUG << "handleProc("<analyze(content, metaData,pipeline,handlers,inactiveUnits); +#ifdef ANTINNO_SPECIFIC + LDEBUG << "handleProc("<analyze(content, metaData,pipeline,handlers,inactiveUnits, stopAnalyze); +#else + LDEBUG << "handleProc("<analyze(content, metaData,pipeline,handlers,inactiveUnits); +#endif } // inline virtual void setAnalysisHandler(const std::string& handlerId, AbstractAnalysisHandler* handler) @@ -83,9 +92,9 @@ class AbstractProcessingClientHandler private: //! 
@brief list of handlers available - std::map m_clients; + std::map > m_clients; }; } -#endif +#endif \ No newline at end of file diff --git a/lima_common/src/common/Handler/AbstractXmlAnalysisHandler.h b/lima_common/src/common/Handler/AbstractXmlAnalysisHandler.h index d821fcadf..acc147623 100644 --- a/lima_common/src/common/Handler/AbstractXmlAnalysisHandler.h +++ b/lima_common/src/common/Handler/AbstractXmlAnalysisHandler.h @@ -68,4 +68,4 @@ class AbstractXmlAnalysisHandler : public AbstractAnalysisHandler } // Lima -#endif +#endif \ No newline at end of file diff --git a/lima_common/src/common/LimaCommon.cpp b/lima_common/src/common/LimaCommon.cpp index d48df0c89..70cc933f5 100644 --- a/lima_common/src/common/LimaCommon.cpp +++ b/lima_common/src/common/LimaCommon.cpp @@ -1,45 +1,85 @@ -/* - Copyright 2002-2013 CEA LIST - - This file is part of LIMA. - - LIMA is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - LIMA is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with LIMA. If not, see -*/ -#include "common/LimaCommon.h" - -#ifdef WIN32 - -#ifdef LIMA_COMMON_EXPORTING -#define LIMA_COMMON_EXPORT __declspec(dllexport) -#else -#define LIMA_COMMON_EXPORT __declspec(dllimport) -#endif - - -#else // Not WIN32 - -#define LIMA_COMMON_EXPORT - -#endif - - -namespace Lima -{ -namespace Common -{ - -LIMA_COMMON_EXPORT void fakeSymbolFoWindowsLinking() {} - -} -} +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. 
+ + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. If not, see +*/ +#include "common/LimaCommon.h" + +/* FWI 22/02/2016 déplacé dans le .h +#ifdef WIN32 + +#ifdef LIMA_COMMON_EXPORTING +#define LIMA_COMMON_EXPORT __declspec(dllexport) +#else +#define LIMA_COMMON_EXPORT __declspec(dllimport) +#endif + + +#else // Not WIN32 + +#define LIMA_COMMON_EXPORT + +#endif +*/ + +#include +#include + +#ifdef ANTINNO_SPECIFIC +namespace Lima +{ +#ifdef _DEBUG +StopAnalyze::StopAnalyze(bool v) : _v(v) +{ +} +StopAnalyze::StopAnalyze(StopAnalyze const& o) : _v(o._v) +{ +} +StopAnalyze::operator bool() const +{ + return _v; +} +StopAnalyze& StopAnalyze::operator=(StopAnalyze const& o) +{ + _v = o._v; + return *this; +} +bool StopAnalyze::operator==(StopAnalyze const& o) +{ + return _v == o._v; +} +bool StopAnalyze::operator!=(StopAnalyze const& o) +{ + return _v != o._v; +} +#else +// nothing +#endif + +StopAnalyze defaultStopAnalyze(false); + +} +#endif + +namespace Lima +{ +namespace Common +{ + +LIMA_COMMON_EXPORT void fakeSymbolFoWindowsLinking() {} + +} +} diff --git a/lima_common/src/common/LimaCommon.h b/lima_common/src/common/LimaCommon.h index a22616c86..b3b0ee0b2 100644 --- a/lima_common/src/common/LimaCommon.h +++ b/lima_common/src/common/LimaCommon.h @@ -1,365 +1,486 @@ -/* - Copyright 2002-2013 CEA LIST - - This file is part of LIMA. 
- - LIMA is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - LIMA is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with LIMA. If not, see -*/ -/************************************************************************ - * - * @file LimaCommon.h (from s2Common.h) - * @author Gael de Chalendar - - * Benoit Mathieu - - * Hervé Le Borgne - - * @date mar déc 18 2007 - * copyright Copyright (C) 2003-2012 by CEA LIST - * Project mm_common - * - * @brief (short description) - * - ***********************************************************************/ -#ifndef LIMA_MMCOMMONS_H -#define LIMA_MMCOMMONS_H - -#include -#include - -#ifdef WIN32 - -#pragma warning( disable : 4512 ) - -// Avoids compilation errors redefining struc sockaddr in ws2def.h -#define _WINSOCKAPI_ - -#undef min -#undef max -typedef __int16 int16_t; -typedef unsigned __int16 uint16_t; -typedef __int32 int32_t; -typedef unsigned __int32 uint32_t; -typedef __int64 int64_t; -typedef unsigned __int64 uint64_t; -#endif - - -#include - -#ifdef WIN32 - - -#ifdef LIMA_DATA_EXPORTING - #define LIMA_DATA_EXPORT __declspec(dllexport) -#else - #define LIMA_DATA_EXPORT __declspec(dllimport) -#endif - -#ifdef LIMA_DATAHANDLER_EXPORTING - #define LIMA_DATAHANDLER_EXPORT __declspec(dllexport) -#else - #define LIMA_DATAHANDLER_EXPORT __declspec(dllimport) -#endif - -#ifdef LIMA_FSAACCESS_EXPORTING - #define LIMA_FSAACCESS_EXPORT __declspec(dllexport) -#else - #define LIMA_FSAACCESS_EXPORT __declspec(dllimport) -#endif - -#ifdef LIMA_MEDIAPROCESSORS_EXPORTING - #define 
LIMA_MEDIAPROCESSORS_EXPORT __declspec(dllexport) -#else - #define LIMA_MEDIAPROCESSORS_EXPORT __declspec(dllimport) -#endif - -#ifdef LIMA_MEDIATICDATA_EXPORTING - #define LIMA_MEDIATICDATA_EXPORT __declspec(dllexport) -#else - #define LIMA_MEDIATICDATA_EXPORT __declspec(dllimport) -#endif - -#ifdef LIMA_COMMONMISC_EXPORTING - #define LIMA_COMMONMISC_EXPORT __declspec(dllexport) -#else - #define LIMA_COMMONMISC_EXPORT __declspec(dllimport) -#endif - -#ifdef LIMA_COMMONTOOLS_EXPORTING - #define LIMA_COMMONTOOLS_EXPORT __declspec(dllexport) -#else - #define LIMA_COMMONTOOLS_EXPORT __declspec(dllimport) -#endif - -#ifdef LIMA_PROCESSUNITFRAMEWORK_EXPORTING - #define LIMA_PROCESSUNITFRAMEWORK_EXPORT __declspec(dllexport) -#else - #define LIMA_PROCESSUNITFRAMEWORK_EXPORT __declspec(dllimport) -#endif - -#ifdef LIMA_TIME_EXPORTING - #define LIMA_TIME_EXPORT __declspec(dllexport) -#else - #define LIMA_TIME_EXPORT __declspec(dllimport) -#endif - -#ifdef LIMA_XMLCONFIGURATIONFILES_EXPORTING - #define LIMA_XMLCONFIGURATIONFILES_EXPORT __declspec(dllexport) -#else - #define LIMA_XMLCONFIGURATIONFILES_EXPORT __declspec(dllimport) -#endif - -#else // Not WIN32 - -#define LIMA_DATA_EXPORT -#define LIMA_DATAHANDLER_EXPORT -#define LIMA_FSAACCESS_EXPORT -#define LIMA_MEDIAPROCESSORS_EXPORT -#define LIMA_MEDIATICDATA_EXPORT -#define LIMA_COMMONMISC_EXPORT -#define LIMA_COMMONTOOLS_EXPORT -#define LIMA_PROCESSUNITFRAMEWORK_EXPORT -#define LIMA_TIME_EXPORT -#define LIMA_XMLCONFIGURATIONFILES_EXPORT - -#endif - -#include -#include - -#ifndef LIMA_DEBUG -#define LIMA_DEBUG 0 -#endif - -// standard include -#include - -#include -#include -#include "common/QsLog/QsLogDest.h" - -#define LTRACE QLOG_TRACE() -#define LDEBUG QLOG_DEBUG() -#define LINFO QLOG_INFO() -#define LNOTICE QLOG_INFO() -#define LWARN QLOG_WARN() -#define LERROR QLOG_ERROR() -#define LFATAL QLOG_FATAL() - -// #define LOGINIT(X) QsLogging::Logger& logger = QsLogging::Logger::instance(X); -// logger.setLoggingLevel( 
QsLogging::Categories::instance().levelFor( X ) ); - -class LogInit +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. If not, see +*/ +/************************************************************************ + * + * @file LimaCommon.h (from s2Common.h) + * @author Gael de Chalendar + + * Benoit Mathieu + + * Hervé Le Borgne + + * @date mar déc 18 2007 + * copyright Copyright (C) 2003-2012 by CEA LIST + * Project mm_common + * + * @brief (short description) + * + ***********************************************************************/ +#ifndef LIMA_MMCOMMONS_H +#define LIMA_MMCOMMONS_H + +#include +#include + +#ifdef WIN32 + +#pragma warning( disable : 4512 ) + +// Avoids compilation errors redefining struc sockaddr in ws2def.h +#define _WINSOCKAPI_ + +#undef min +#undef max +typedef __int16 int16_t; +typedef unsigned __int16 uint16_t; +typedef __int32 int32_t; +typedef unsigned __int32 uint32_t; +typedef __int64 int64_t; +typedef unsigned __int64 uint64_t; +#endif + + +#include + +#ifdef WIN32 + + +#ifdef LIMA_DATA_EXPORTING + #define LIMA_DATA_EXPORT __declspec(dllexport) +#else + #define LIMA_DATA_EXPORT __declspec(dllimport) +#endif + +#ifdef LIMA_DATAHANDLER_EXPORTING + #define LIMA_DATAHANDLER_EXPORT __declspec(dllexport) +#else + #define LIMA_DATAHANDLER_EXPORT __declspec(dllimport) +#endif + +#ifdef LIMA_FSAACCESS_EXPORTING + #define LIMA_FSAACCESS_EXPORT 
__declspec(dllexport) +#else + #define LIMA_FSAACCESS_EXPORT __declspec(dllimport) +#endif + +#ifdef LIMA_MEDIAPROCESSORS_EXPORTING + #define LIMA_MEDIAPROCESSORS_EXPORT __declspec(dllexport) +#else + #define LIMA_MEDIAPROCESSORS_EXPORT __declspec(dllimport) +#endif + +#ifdef LIMA_MEDIATICDATA_EXPORTING + #define LIMA_MEDIATICDATA_EXPORT __declspec(dllexport) +#else + #define LIMA_MEDIATICDATA_EXPORT __declspec(dllimport) +#endif + +#ifdef LIMA_COMMONMISC_EXPORTING + #define LIMA_COMMONMISC_EXPORT __declspec(dllexport) +#else + #define LIMA_COMMONMISC_EXPORT __declspec(dllimport) +#endif + +#ifdef LIMA_COMMONTOOLS_EXPORTING + #define LIMA_COMMONTOOLS_EXPORT __declspec(dllexport) +#else + #define LIMA_COMMONTOOLS_EXPORT __declspec(dllimport) +#endif + +#ifdef LIMA_PROCESSUNITFRAMEWORK_EXPORTING + #define LIMA_PROCESSUNITFRAMEWORK_EXPORT __declspec(dllexport) +#else + #define LIMA_PROCESSUNITFRAMEWORK_EXPORT __declspec(dllimport) +#endif + +#ifdef LIMA_TIME_EXPORTING + #define LIMA_TIME_EXPORT __declspec(dllexport) +#else + #define LIMA_TIME_EXPORT __declspec(dllimport) +#endif + +#ifdef LIMA_XMLCONFIGURATIONFILES_EXPORTING + #define LIMA_XMLCONFIGURATIONFILES_EXPORT __declspec(dllexport) +#else + #define LIMA_XMLCONFIGURATIONFILES_EXPORT __declspec(dllimport) +#endif + +#else // Not WIN32 + +#define LIMA_DATA_EXPORT +#define LIMA_DATAHANDLER_EXPORT +#define LIMA_FSAACCESS_EXPORT +#define LIMA_MEDIAPROCESSORS_EXPORT +#define LIMA_MEDIATICDATA_EXPORT +#define LIMA_COMMONMISC_EXPORT +#define LIMA_COMMONTOOLS_EXPORT +#define LIMA_PROCESSUNITFRAMEWORK_EXPORT +#define LIMA_TIME_EXPORT +#define LIMA_XMLCONFIGURATIONFILES_EXPORT + +#endif + +#include +#include + +#ifndef LIMA_DEBUG +#define LIMA_DEBUG 0 +#endif + +// standard include +#include + +#include +#include +#include "common/QsLog/QsLogDest.h" +#ifdef ANTINNO_SPECIFIC +// FWI 19/05/2016 ajout 2 includes +#include +#include + +#ifdef WIN32 + +#ifdef LIMA_COMMON_EXPORTING +#define LIMA_COMMON_EXPORT 
__declspec(dllexport) +#else +#define LIMA_COMMON_EXPORT __declspec(dllimport) +#endif + + +#else // Not WIN32 + +#define LIMA_COMMON_EXPORT + +#endif +namespace Lima +{ +#ifdef _DEBUG + class LIMA_COMMON_EXPORT StopAnalyze + { + bool _v; + public: + StopAnalyze(bool v); + StopAnalyze(StopAnalyze const&); + operator bool() const; + StopAnalyze& operator=(StopAnalyze const& o); + bool operator==(StopAnalyze const& o); + bool operator!=(StopAnalyze const& o); + }; +#else + typedef bool LIMA_COMMON_EXPORT StopAnalyze; +#endif + extern LIMA_COMMON_EXPORT StopAnalyze defaultStopAnalyze; +} + +#define LTRACE \ + if ( logger.loggingLevel() <= QsLogging::TraceLevel ) \ + QsLogging::antinno::LogHelper(QsLogging::TraceLevel, logger.zone()).stream() +#define LDEBUG \ + if ( logger.loggingLevel() <= QsLogging::DebugLevel ) \ + QsLogging::antinno::LogHelper(QsLogging::DebugLevel, logger.zone()).stream() +#define LINFO \ + if ( logger.loggingLevel() <= QsLogging::InfoLevel ) \ + QsLogging::antinno::LogHelper(QsLogging::InfoLevel, logger.zone()).stream() +#define LNOTICE \ + if ( logger.loggingLevel() <= QsLogging::InfoLevel ) \ + QsLogging::antinno::LogHelper(QsLogging::InfoLevel, logger.zone()).stream() +#define LWARN \ + if ( logger.loggingLevel() <= QsLogging::WarnLevel ) \ + QsLogging::antinno::LogHelper(QsLogging::WarnLevel, logger.zone()).stream() +#define LERROR \ + if ( logger.loggingLevel() <= QsLogging::ErrorLevel ) \ + QsLogging::antinno::LogHelper(QsLogging::ErrorLevel, logger.zone()).stream() +#define LFATAL \ + if ( logger.loggingLevel() <= QsLogging::FatalLevel ) \ + QsLogging::antinno::LogHelper(QsLogging::FatalLevel, logger.zone()).stream() + +#else + +#define LTRACE QLOG_TRACE() +#define LDEBUG QLOG_DEBUG() +#define LINFO QLOG_INFO() +#define LNOTICE QLOG_INFO() +#define LWARN QLOG_WARN() +#define LERROR QLOG_ERROR() +#define LFATAL QLOG_FATAL() + +#endif + +// #define LOGINIT(X) QsLogging::Logger& logger = QsLogging::Logger::instance(X); +// 
logger.setLoggingLevel( QsLogging::Categories::instance().levelFor( X ) ); + +class LogInit +{ +public: + LogInit(char const* x) + { + // initialisation thread-safe + static QMutex mutex; + QMutexLocker locker(&mutex); + pLogger = &QsLogging::Logger::instance(x); +#ifndef DEBUG_CD + QsLogging::Level level = QsLogging::Categories::instance().levelFor(x); + pLogger->setLoggingLevel(level); +#endif + + } + QsLogging::Logger* pLogger; +}; +#ifndef DEBUG_CD +#define LOGINIT(X) \ + static LogInit logInit(X); /*initialisation exécutée une seul fois*/\ + auto& logger = *(logInit.pLogger); +#else +#define LOGINIT(X) \ + static LogInit logInit(X); /*initialisation exécutée une seul fois*/\ + auto& logger = *(logInit.pLogger); \ + logger.setLoggingLevel(QsLogging::Categories::instance().levelFor( X )); +#endif + +//QsLogging::DestinationPtr debugDestination( QsLogging::DestinationFactory::MakeDebugOutputDestination() ); +//logger.addDestination(debugDestination.get()); +#ifdef ANTINNO_SPECIFIC +// FWI 07/10/2015 ajout pour les logger +static std::ostream& operator<<(std::ostream &os, const QString& s) { -public: - LogInit(char const* x) - { - // initialisation thread-safe - static QMutex mutex; - QMutexLocker locker(&mutex); - pLogger = &QsLogging::Logger::instance(x); - QsLogging::Level level = QsLogging::Categories::instance().levelFor(x); - pLogger->setLoggingLevel(level); - } - QsLogging::Logger* pLogger; -}; -#ifndef DEBUG_CD -#define LOGINIT(X) \ - static LogInit logInit(X); /*initialisation exécutée une seul fois*/\ - auto& logger = *(logInit.pLogger); -#else -#define LOGINIT(X) \ - static LogInit logInit(X); /*initialisation exécutée une seul fois*/\ - auto& logger = *(logInit.pLogger); \ - logger.setLoggingLevel(QsLogging::Categories::instance().levelFor( X )); -#endif - -//QsLogging::DestinationPtr debugDestination( QsLogging::DestinationFactory::MakeDebugOutputDestination() ); -//logger.addDestination(debugDestination.get()); - - -#define LENDL ". 
Note: LENDL is deprecated. It will be removed from a future release." - -#define ABSTRACTFACTORYPATTERNLOGINIT LOGINIT("Common::AbstractFactoryPattern") -#define ABSTRACTPROCESSINGCLIENTLOGINIT LOGINIT("Common::AbstractProcessingClient") -#define AGLOGINIT LOGINIT("Common::AnnotationGraph") -#define BOWLOGINIT LOGINIT("Common::BOW"); -#define CLIENTFACTORYLOGINIT LOGINIT("Common::ClientFactory") -#define COMPSTRACCESSLOGINIT LOGINIT("Common::CompStrAccess") -#define FSAAHASHLOGINIT LOGINIT("Common::FsaAccessHash") -#define FSAAIOLOGINIT LOGINIT("Common::FsaAccessIO") -#define FSAALOGINIT LOGINIT("Common::FsaAccess") -#define HANDLERLOGINIT LOGINIT("Common::Handler") -#define LDATALOGINIT LOGINIT("Common::LanguageData") -#define MDATALOGINIT LOGINIT("Common::MediaticData") -#define MISCLOGINIT LOGINIT("Common::Misc") -#define PROCESSORSLOGINIT LOGINIT("Common::Processors") -#define PROCESSUNITFRAMEWORKLOGINIT LOGINIT("Common::ProcessUnitFramework") -#define PROPERTYCODELOGINIT LOGINIT("Common::PropertyCode") -#define STRINGMAPLOGINIT LOGINIT("Common::StringMap") -#define STRPOOLLOGINIT LOGINIT("Common::StringPool") -#define TGVLOGINIT LOGINIT("Common::TGV") -#define XMLCFGLOGINIT LOGINIT("Common::XMLConfigurationFiles") -#define DYNAMICLIBMANAGERLOGINIT LOGINIT("Common::DynamicLibrariesManager") - -QDebug& operator<< (QDebug& qd, const std::string& str ); - - -#ifndef LIMA_UNUSED -#define LIMA_UNUSED(x) (void)x; -#endif - -namespace Lima -{ - -enum LimaStatusCode { - SUCCESS_ID, - CANNOT_OPEN_FILE_ERROR, - OUT_OF_RANGE_ERROR, - UNKNOWN_ERROR, - UNSUPPORTED_LANGUAGE, - INVALID_CONFIGURATION, - MISSING_DATA -}; - -BOOST_STRONG_TYPEDEF(uint32_t, LinguisticCode); -BOOST_STRONG_TYPEDEF(char, NoParameters); - -#define UNDEFLANG std::numeric_limits::max() - -BOOST_STRONG_TYPEDEF(uint8_t, MediaId); - -class LimaException : public std::exception -{ -public: - LimaException() : std::exception(),m_reason() {} - LimaException(const std::string& mess) : std::exception(), 
m_reason(mess) {} - virtual ~LimaException() throw() {} - virtual const char * what () const throw() { - return m_reason.c_str(); - } -protected: - LimaException& operator=(const LimaException&) {return *this;} - const std::string m_reason; -}; - -class InvalidConfiguration : public LimaException -{ - public: - InvalidConfiguration() : LimaException() {}; - InvalidConfiguration(const std::string& mess) : LimaException(mess) {} -private: - InvalidConfiguration& operator=(const InvalidConfiguration&) {return *this;} -}; -class MediaNotInitialized : public LimaException -{ -public : - MediaNotInitialized(MediaId medId) : LimaException(),m_medId(medId),m_med(),m_num(true) {}; - MediaNotInitialized(const std::string& med) : LimaException(),m_medId(0),m_med(med),m_num(false) {}; - virtual ~MediaNotInitialized() throw() {}; - const char* what() const throw() - { - if (m_num) - { - std::ostringstream oo; - oo << "uninitialized media " << (int)m_medId; - return oo.str().c_str(); - } - else - { - return (std::string("uninitialized media ")+m_med).c_str(); - } - }; -private: - MediaNotInitialized& operator=(const MediaNotInitialized&) {return *this;} - MediaId m_medId; - std::string m_med; - bool m_num; -}; - -class LanguageNotInitialized : public LimaException { -public : - LanguageNotInitialized(MediaId langId) : LimaException(),m_langId(langId),m_lang(),m_num(true) {}; - LanguageNotInitialized(const std::string& lang) : LimaException(),m_langId(0),m_lang(lang),m_num(false) {}; - virtual ~LanguageNotInitialized() throw() {}; - const char* what() const throw() { - if (m_num) { - std::ostringstream oo; - oo << "uninitialized language " << (int)m_langId; - return oo.str().c_str(); - } else { - return (std::string("uninitialized language ")+m_lang).c_str(); - } - }; -private: - LanguageNotInitialized& operator=(const LanguageNotInitialized&) {return *this;} - MediaId m_langId; - std::string m_lang; - bool m_num; -}; - -class AccessByStringNotInitialized : public LimaException { 
-public : - AccessByStringNotInitialized(const std::string& reason) : LimaException(), m_reason(reason) {}; - virtual ~AccessByStringNotInitialized() throw() {}; - const char* what() const throw() { - std::ostringstream oo; - oo << "Fsa not initialized because of " << m_reason; - return oo.str().c_str(); - }; -private: - AccessByStringNotInitialized& operator=(const AccessByStringNotInitialized&) {return *this;} - std::string m_reason; -}; - -class AccessByStringOutOfRange : public LimaException { -public : - AccessByStringOutOfRange(const std::string& reason) : LimaException(), m_reason(reason) {}; - virtual ~AccessByStringOutOfRange() throw() {}; - const char* what() const throw() { - std::ostringstream oo; - oo << "parameter out of range " << m_reason; - return oo.str().c_str(); - }; -private: - AccessByStringOutOfRange& operator=(const AccessByStringOutOfRange&) {return *this;} - std::string m_reason; -}; - -class IncompleteResources : public LimaException { -public : - IncompleteResources(const std::string& reason) : LimaException(), m_reason(reason) {} - virtual ~IncompleteResources() throw() {} - const char* what() const throw() { - return (std::string("incomplete ressources: ") + m_reason).c_str() ; - } -private: - IncompleteResources& operator=(const IncompleteResources&) {return *this;} - std::string m_reason; -}; - -class XMLException : public std::runtime_error -{ -public: - explicit XMLException(const std::string& msg = "") : std::runtime_error(msg) {} - const char* getMessage() const {return this->what();} -private: - XMLException& operator=(const XMLException&) {return *this;} -}; - - -} // closing namespace Lima - -#endif // LIMA_MMCOMMONS_H + os << s.toUtf8().constData(); + return os; +} + +static ::std::ostream& operator<<(::std::ostream& out, QStringList const& o) +{ + bool isFirst = true; + for(auto it=o.constBegin(); it!=o.constEnd(); ++it) + { + out << (isFirst?L"":L",") << *it; + isFirst = false; + } + return out; +} +#endif + + + +#define 
LENDL ". Note: LENDL is deprecated. It will be removed from a future release." + +#define ABSTRACTFACTORYPATTERNLOGINIT LOGINIT("Common::AbstractFactoryPattern") +#define ABSTRACTPROCESSINGCLIENTLOGINIT LOGINIT("Common::AbstractProcessingClient") +#define AGLOGINIT LOGINIT("Common::AnnotationGraph") +#define BOWLOGINIT LOGINIT("Common::BOW"); +#define CLIENTFACTORYLOGINIT LOGINIT("Common::ClientFactory") +#define COMPSTRACCESSLOGINIT LOGINIT("Common::CompStrAccess") +#define FSAAHASHLOGINIT LOGINIT("Common::FsaAccessHash") +#define FSAAIOLOGINIT LOGINIT("Common::FsaAccessIO") +#define FSAALOGINIT LOGINIT("Common::FsaAccess") +#define HANDLERLOGINIT LOGINIT("Common::Handler") +#define LDATALOGINIT LOGINIT("Common::LanguageData") +#define MDATALOGINIT LOGINIT("Common::MediaticData") +#define MISCLOGINIT LOGINIT("Common::Misc") +#define PROCESSORSLOGINIT LOGINIT("Common::Processors") +#define PROCESSUNITFRAMEWORKLOGINIT LOGINIT("Common::ProcessUnitFramework") +#define PROPERTYCODELOGINIT LOGINIT("Common::PropertyCode") +#define STRINGMAPLOGINIT LOGINIT("Common::StringMap") +#define STRPOOLLOGINIT LOGINIT("Common::StringPool") +#define TGVLOGINIT LOGINIT("Common::TGV") +#define XMLCFGLOGINIT LOGINIT("Common::XMLConfigurationFiles") +#define DYNAMICLIBMANAGERLOGINIT LOGINIT("Common::DynamicLibrariesManager") + +QDebug& operator<< (QDebug& qd, const std::string& str ); + + +#ifndef LIMA_UNUSED +#define LIMA_UNUSED(x) (void)x; +#endif + +namespace Lima +{ + +enum LimaStatusCode { + SUCCESS_ID, + CANNOT_OPEN_FILE_ERROR, + OUT_OF_RANGE_ERROR, + UNKNOWN_ERROR, + UNSUPPORTED_LANGUAGE, + INVALID_CONFIGURATION, + MISSING_DATA +#ifdef ANTINNO_SPECIFIC + // FWI 22/02/2016 ajout TIME_OVERFLOW pour stopAnalyze + ,TIME_OVERFLOW +#endif +}; + +#ifdef ANTINNO_SPECIFIC +BOOST_STRONG_TYPEDEF(unsigned int, ReformulationType) +#endif + + +BOOST_STRONG_TYPEDEF(uint32_t, LinguisticCode); +#ifdef ANTINNO_SPECIFIC +// FWI 25/05/2016 : on spécialise max() pour le type LinguisticCode sinon le 
max() d'origine renvoit 0 (ce qui est un bug de la lib std) +} +namespace std { + template <> Lima::LinguisticCode numeric_limits::max() { return Lima::LinguisticCode(::std::numeric_limits::max()); } +} +namespace Lima { +#endif + +BOOST_STRONG_TYPEDEF(char, NoParameters); +#ifdef ANTINNO_SPECIFIC +// FWI 25/05/2016 : on spécialise max() pour le type LinguisticCode sinon le max() d'origine renvoit 0 (ce qui est un bug de la lib std) +} +namespace std { + template <> Lima::NoParameters numeric_limits::max() { return Lima::NoParameters(::std::numeric_limits::max()); } +} +namespace Lima { +#endif + +#define UNDEFLANG std::numeric_limits::max() + +BOOST_STRONG_TYPEDEF(uint8_t, MediaId); +#ifdef ANTINNO_SPECIFIC +// FWI 25/05/2016 : on spécialise max() pour le type MediaId sinon le max() d'origine renvoit 0 (ce qui est un bug de la lib std) +} +namespace std { + template <> Lima::MediaId numeric_limits::max() { return Lima::MediaId(::std::numeric_limits::max()); } +} +namespace Lima { +#endif + +class LimaException : public std::exception +{ +public: + LimaException() : std::exception(),m_reason() {} + LimaException(const std::string& mess) : std::exception(), m_reason(mess) {} + virtual ~LimaException() throw() {} + virtual const char * what () const throw() { + return m_reason.c_str(); + } +protected: + LimaException& operator=(const LimaException&) {return *this;} + const std::string m_reason; +}; + +class InvalidConfiguration : public LimaException +{ + public: + InvalidConfiguration() : LimaException() {}; + InvalidConfiguration(const std::string& mess) : LimaException(mess) {} +private: + InvalidConfiguration& operator=(const InvalidConfiguration&) {return *this;} +}; +class MediaNotInitialized : public LimaException +{ +public : + MediaNotInitialized(MediaId medId) : LimaException(),m_medId(medId),m_med(),m_num(true) {}; + MediaNotInitialized(const std::string& med) : LimaException(),m_medId(0),m_med(med),m_num(false) {}; + virtual ~MediaNotInitialized() throw() 
{}; + const char* what() const throw() + { + if (m_num) + { + std::ostringstream oo; + oo << "uninitialized media " << (int)m_medId; + return oo.str().c_str(); + } + else + { + return (std::string("uninitialized media ")+m_med).c_str(); + } + }; +private: + MediaNotInitialized& operator=(const MediaNotInitialized&) {return *this;} + MediaId m_medId; + std::string m_med; + bool m_num; +}; + +class LanguageNotInitialized : public LimaException { +public : + LanguageNotInitialized(MediaId langId) : LimaException(),m_langId(langId),m_lang(),m_num(true) {}; + LanguageNotInitialized(const std::string& lang) : LimaException(),m_langId(0),m_lang(lang),m_num(false) {}; + virtual ~LanguageNotInitialized() throw() {}; + const char* what() const throw() { + if (m_num) { + std::ostringstream oo; + oo << "uninitialized language " << (int)m_langId; + return oo.str().c_str(); + } else { + return (std::string("uninitialized language ")+m_lang).c_str(); + } + }; +private: + LanguageNotInitialized& operator=(const LanguageNotInitialized&) {return *this;} + MediaId m_langId; + std::string m_lang; + bool m_num; +}; + +class AccessByStringNotInitialized : public LimaException { +public : + AccessByStringNotInitialized(const std::string& reason) : LimaException(), m_reason(reason) {}; + virtual ~AccessByStringNotInitialized() throw() {}; + const char* what() const throw() { + std::ostringstream oo; + oo << "Fsa not initialized because of " << m_reason; + return oo.str().c_str(); + }; +private: + AccessByStringNotInitialized& operator=(const AccessByStringNotInitialized&) {return *this;} + std::string m_reason; +}; + +class AccessByStringOutOfRange : public LimaException { +public : + AccessByStringOutOfRange(const std::string& reason) : LimaException(), m_reason(reason) {}; + virtual ~AccessByStringOutOfRange() throw() {}; + const char* what() const throw() { + std::ostringstream oo; + oo << "parameter out of range " << m_reason; + return oo.str().c_str(); + }; +private: + 
AccessByStringOutOfRange& operator=(const AccessByStringOutOfRange&) {return *this;} + std::string m_reason; +}; + +class IncompleteResources : public LimaException { +public : + IncompleteResources(const std::string& reason) : LimaException(), m_reason(reason) {} + virtual ~IncompleteResources() throw() {} + const char* what() const throw() { + return (std::string("incomplete ressources: ") + m_reason).c_str() ; + } +private: + IncompleteResources& operator=(const IncompleteResources&) {return *this;} + std::string m_reason; +}; + +class XMLException : public std::runtime_error +{ +public: + explicit XMLException(const std::string& msg = "") : std::runtime_error(msg) {} + const char* getMessage() const {return this->what();} +private: + XMLException& operator=(const XMLException&) {return *this;} +}; + + +} // closing namespace Lima + +#endif // LIMA_MMCOMMONS_H diff --git a/lima_common/src/common/MediaProcessors/MediaProcessors.cpp b/lima_common/src/common/MediaProcessors/MediaProcessors.cpp index e5fcfd051..11ddb9c34 100644 --- a/lima_common/src/common/MediaProcessors/MediaProcessors.cpp +++ b/lima_common/src/common/MediaProcessors/MediaProcessors.cpp @@ -29,6 +29,7 @@ #include "common/XMLConfigurationFiles/moduleConfigurationStructure.h" #include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" #include "common/MediaticData/mediaticData.h" +#include "common/tools/FileUtils.h" #include @@ -70,18 +71,10 @@ MediaProcessors::MediaProcessors(const MediaProcessors& mp) : Singleton::iterator it=m_d->m_pipelineManagers.begin(); it!=m_d->m_pipelineManagers.end(); it++ ) { -#ifdef DEBUG_CD - LDEBUG << "delete " << it->first; -#endif delete it->second; it->second=0; } @@ -180,6 +173,10 @@ void MediaProcessors::initPipelines ( { std::cout << "no pipeline '" << *pipItr << "' for media " << mediaStr << std::endl; // continue; +#ifdef ANTINNO_BUGFIX + // FWI 26/04/2016 : activation du "continue" sinon entryItr->second provoque une erreur détectée seulement en 
mode debug + continue; +#endif } const MediaProcessUnit* pu=mapItr->second->getObject ( entryItr->second ); const MediaProcessUnitPipeline* pipeline=static_cast ( pu ); @@ -252,8 +249,7 @@ includeProcessors(Common::XMLConfigurationFiles::ModuleConfigurationStructure& m try { //PROCESSORSLOGINIT; //LDEBUG << "i="<< i; - fileName=Common::MediaticData::MediaticData::single().getConfigPath()+ - "/"+string((*it),0,i); + fileName=Common::Misc::findFileInPaths(Common::MediaticData::MediaticData::single().getConfigPath().c_str(),string((*it),0,i).c_str()).toUtf8().constData(); //LDEBUG << "filename="<< fileName; moduleName=string((*it),i+1); //LDEBUG << "moduleName="<< moduleName; diff --git a/lima_common/src/common/MediaticData/EntityType.h b/lima_common/src/common/MediaticData/EntityType.h index 8e678be90..94e462ebb 100644 --- a/lima_common/src/common/MediaticData/EntityType.h +++ b/lima_common/src/common/MediaticData/EntityType.h @@ -36,7 +36,7 @@ #include -#include +#include namespace Lima { namespace Common { diff --git a/lima_common/src/common/MediaticData/mediaData.cpp b/lima_common/src/common/MediaticData/mediaData.cpp index 7880859a2..7dad1a72a 100644 --- a/lima_common/src/common/MediaticData/mediaData.cpp +++ b/lima_common/src/common/MediaticData/mediaData.cpp @@ -53,10 +53,10 @@ class MediaDataPrivate //std::list< LinguisticCode > m_sentenceBreakMicros; }; -MediaData::MediaData() : m_d(new MediaDataPrivate()) +MediaData::MediaData() : InitializableObject(), m_d(new MediaDataPrivate()) {} -MediaData::MediaData(const MediaData& md) : m_d(new MediaDataPrivate(*md.m_d)) +MediaData::MediaData(const MediaData& md) : InitializableObject(md), m_d(new MediaDataPrivate(*md.m_d)) {} MediaData::~MediaData() diff --git a/lima_common/src/common/MediaticData/mediaData.h b/lima_common/src/common/MediaticData/mediaData.h index d7dfa2c3c..f796f7933 100644 --- a/lima_common/src/common/MediaticData/mediaData.h +++ b/lima_common/src/common/MediaticData/mediaData.h @@ -31,7 +31,7 @@ 
#include //uint32_t #endif -#include +#include #include namespace Lima @@ -47,6 +47,15 @@ namespace MediaticData #define MEDIADATA_CLASSID "MediaData" BOOST_STRONG_TYPEDEF(boost::uint32_t, ConceptType); +#ifdef ANTINNO_SPECIFIC +// FWI 25/05/2016 : on spécialise max() pour le type ConceptType sinon le max() d'origine renvoit 0 (ce qui est un bug de la lib std) +}}} +namespace std { + template <> Lima::Common::MediaticData::ConceptType numeric_limits::max() + { return Lima::Common::MediaticData::ConceptType(::std::numeric_limits::max()); } +} +namespace Lima { namespace Common { namespace MediaticData{ +#endif class MediaDataPrivate; /** diff --git a/lima_common/src/common/MediaticData/mediaticData.cpp b/lima_common/src/common/MediaticData/mediaticData.cpp index 1880c53ae..235dc7ce4 100644 --- a/lima_common/src/common/MediaticData/mediaticData.cpp +++ b/lima_common/src/common/MediaticData/mediaticData.cpp @@ -32,6 +32,7 @@ #include "common/LimaCommon.h" #include "common/QsLog/QsLog.h" #include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" +#include "common/tools/FileUtils.h" #include "common/Data/readwritetools.h" #include "common/misc/DoubleAccessObjectToIdMap.h" //#include "common/misc/strwstrtools.h" @@ -51,6 +52,7 @@ #include #include #include +#include using namespace std; @@ -98,7 +100,7 @@ class MediaticDataPrivate std::map< std::string, MediaId > m_mediasIds; std::map< MediaId, std::string > m_mediasSymbol; - std::map< MediaId, std::string > m_mediaDefinitionFiles; + std::map< MediaId, QString > m_mediaDefinitionFiles; std::map< MediaId, MediaData* > m_mediasData; // entity types @@ -203,7 +205,7 @@ void MediaticData::init( // TimeUtils::updateCurrentTime(); MDATALOGINIT; - LINFO << "MediaticData::init " << resourcesPath.c_str() << " " << configPath.c_str() << " " << configFile.c_str(); + LINFO << "MediaticData::init " << resourcesPath << " " << configPath << " " << configFile; //LINFO << "Mediatic data initialization"; 
m_d->m_resourcesPath=resourcesPath; @@ -211,38 +213,55 @@ void MediaticData::init( m_d->m_configFile=configFile; //LINFO << "initialize XMLParser"; - initXMLParser(); - //LINFO << "parse configuration file: " << configPath << "/" << configFile; - Common::XMLConfigurationFiles::XMLConfigurationFileParser configuration(configPath + "/" + configFile); - - LINFO << "MediaticData::init for "; - for (std::deque< std::string >::const_iterator it = meds.begin(); it != meds.end(); it++) - LINFO << " " << (*it).c_str(); - - // initHomoSyntagmaticChainsAndRelationsTypes(*configParser); - LDEBUG << "initialize global parameters"; - m_d->initReleaseStringsPool(configuration); + QStringList configPaths = QString::fromUtf8(configPath.c_str()).split(LIMA_PATH_SEPARATOR); + QStringList configFiles = QString::fromUtf8(configFile.c_str()).split(LIMA_PATH_SEPARATOR); + bool configurationFileFound = false; + Q_FOREACH(QString confPath, configPaths) + { + Q_FOREACH(QString confFile, configFiles) + { + if (QFileInfo(confPath + "/" + confFile).exists()) + { + LDEBUG << "MediaticData::init parse configuration file: " << (confPath + "/" + confFile); + configurationFileFound = true; + Common::XMLConfigurationFiles::XMLConfigurationFileParser configuration((confPath + "/" + confFile).toUtf8().constData()); - initEntityTypes(configuration); + // initHomoSyntagmaticChainsAndRelationsTypes(*configParser); + LDEBUG << "MediaticData::init initialize global parameters"; + m_d->initReleaseStringsPool(configuration); - m_d->initRelations(configuration); - - m_d->initConceptTypes(configuration); - - /** - * initialize active medias - */ + initEntityTypes(configuration); - m_d->initMedias(configuration, meds); - - m_d->m_mediasData.clear(); - for (map::const_iterator it=m_d->m_mediasIds.begin(); - it!=m_d->m_mediasIds.end(); - it++) + m_d->initRelations(configuration); + + m_d->initConceptTypes(configuration); + + /** + * initialize active medias + */ + LINFO << "!!! 
MediaticData::init for "; + for (std::deque< std::string >::const_iterator it = meds.begin(); it != meds.end(); it++) + LINFO << " " << (*it).c_str(); + + m_d->initMedias(configuration, meds); + + m_d->m_mediasData.clear(); + for (map::const_iterator it=m_d->m_mediasIds.begin(); + it!=m_d->m_mediasIds.end(); + it++) + { + initMediaData(it->second); + } + } + if (configurationFileFound) break; + } + if (configurationFileFound) break; + } + if (!configurationFileFound) { - initMediaData(it->second); + MDATALOGINIT; + LERROR << "No configuration file has been found with" << configPath << "and" << configFile; } - //LINFO << "Mediatic data initialization finished"; // TimeUtils::logElapsedTime("MediaticDataInit"); } @@ -265,7 +284,7 @@ void MediaticData::initMedia(const std::string& media) LINFO << "MediaticData::initMedia" << media; //LINFO << "parse configuration file: " << configPath << "/" << configFile; - Common::XMLConfigurationFiles::XMLConfigurationFileParser configuration(m_d->m_configPath + "/" + m_d->m_configFile); + Common::XMLConfigurationFiles::XMLConfigurationFileParser configuration(Common::Misc::findFileInPaths(m_d->m_configPath.c_str(), m_d->m_configFile.c_str()).toUtf8().constData()); Lima::Common::MediaticData::MediaticData::changeable().initEntityTypes(configuration); std::deque< std::string > meds; @@ -349,13 +368,6 @@ MediaData& MediaticData::mediaData(MediaId media) return *(it->second); } -void MediaticData::initXMLParser() -{ -// MDATALOGINIT; - //LINFO << "XMLParser initialization"; - -} - void MediaticDataPrivate::initMedias( XMLConfigurationFileParser& configParser, const std::deque< std::string >& meds) @@ -402,9 +414,32 @@ void MediaticDataPrivate::initMedias( m_mediasIds[*it]=id; m_mediasSymbol[id]=*it; - string deffile=configParser.getModuleGroupParamValue("common","mediaDefinitionFiles",*it); - m_mediaDefinitionFiles[id]= m_configPath+"/"+deffile; - + QString deffile= 
QString::fromUtf8(configParser.getModuleGroupParamValue("common","mediaDefinitionFiles",*it).c_str()); + QStringList configPaths = QString::fromUtf8(m_configPath.c_str()).split(LIMA_PATH_SEPARATOR); + bool mediaDefinitionFileFound = false; +#ifdef ANTINNO_SPECIFIC + Q_FOREACH(const QString& confPath, configPaths) +#else + for(const QString& confPath: configPaths) +#endif + { + if (QFileInfo(confPath + "/" + deffile).exists()) + { + m_mediaDefinitionFiles[id] = (confPath+"/"+deffile); +#ifdef DEBUG_CD + LDEBUG << "media definition file for id" << id << "is" << m_mediaDefinitionFiles[id]; +#endif + mediaDefinitionFileFound = true; + break; + } + } + if (!mediaDefinitionFileFound) + { + MDATALOGINIT; + LERROR << "No media definition file'"<::const_iterator it=m_d->m_mediaDefinitionFiles.find(med); + auto it=m_d->m_mediaDefinitionFiles.find(med); if (it==m_d->m_mediaDefinitionFiles.end()) { MDATALOGINIT; @@ -431,9 +466,9 @@ void MediaticData::initMediaData(MediaId med) throw InvalidConfiguration(); } #ifdef DEBUG_CD - LDEBUG << "MediaticData::initMediaData Parse MediaConfigurationFile " << (it->second).c_str(); + LDEBUG << "MediaticData::initMediaData Parse MediaConfigurationFile " << (it->second); #endif - XMLConfigurationFileParser parser(it->second); + XMLConfigurationFileParser parser((it->second).toUtf8().constData()); #ifdef DEBUG_CD LDEBUG << "MediaticData::initMediaData Class: " << parser.getModuleGroupParamValue("MediaData","Class","class").c_str(); @@ -486,8 +521,8 @@ void MediaticDataPrivate::initRelations( { #ifdef DEBUG_CD MDATALOGINIT; + LDEBUG << "MediaticDataPrivate::initRelations"; #endif - //LINFO << "intialize Relations"; m_relTypes[s_undefinedRelation]=0; m_relTypesNum[0]=s_undefinedRelation; @@ -496,14 +531,14 @@ void MediaticDataPrivate::initRelations( for (map::const_iterator it=rels.begin(); it!=rels.end(); it++) - { - uint8_t relId=atoi(it->second.c_str()); + { + uint8_t relId=atoi(it->second.c_str()); #ifdef DEBUG_CD - LDEBUG << "read relation 
" << it->first.c_str() << " -> " << (int)relId; + LDEBUG << "read relation " << it->first.c_str() << " -> " << (int)relId; #endif - m_relTypes[it->first]=relId; - m_relTypesNum[relId]=it->first; - } + m_relTypes[it->first]=relId; + m_relTypesNum[relId]=it->first; + } } catch (NoSuchGroup& ) { MDATALOGINIT; @@ -521,8 +556,8 @@ void MediaticDataPrivate::initConceptTypes( { #ifdef DEBUG_CD MDATALOGINIT; + LDEBUG << "MediaticDataPrivate::initConceptTypes"; #endif - //LINFO << "intialize Concepts Types"; try { const map& mapping=configParser.getModuleConfiguration("common").getGroupNamed("SemanticData").getMapAtKey("conceptTypes"); @@ -653,10 +688,10 @@ void MediaticData::initEntityTypes(XMLConfigurationFileParser& configParser) LimaString groupName=Common::Misc::utf8stdstring2limastring((*it).first); - if (groupName=="include") { + if (groupName=="include") + { deque includeList=moduleConf.getListValuesAtKeyOfGroupNamed("includeList","include"); string::size_type i; - string fileName(""); string moduleName(""); for (std::size_t k=0; km_configPath.c_str()).split(LIMA_PATH_SEPARATOR); + Q_FOREACH(QString confPath, configPaths) + { + if (QFileInfo(confPath + "/" + string(includeList[k],0,i).c_str()).exists()) + { + + std::string fileName= (confPath + "/" + string(includeList[k],0,i).c_str()).toUtf8().constData(); - Lima::Common::XMLConfigurationFiles::XMLConfigurationFileParser lpconfig2(fileName); - Common::MediaticData::MediaticData::changeable().initEntityTypes(lpconfig2); + Lima::Common::XMLConfigurationFiles::XMLConfigurationFileParser lpconfig2(fileName); + Common::MediaticData::MediaticData::changeable().initEntityTypes(lpconfig2); + break; + } + } } - } else { + } + else + { EntityGroupId groupId=addEntityGroup(groupName); #ifdef DEBUG_CD LDEBUG << "initEntityTypes: id is " << groupId; @@ -768,15 +813,14 @@ EntityType MediaticData::getEntityType(const EntityGroupId groupId, MDATALOGINIT; LERROR << "MediaticData::getEntityType unknown entity group id " << groupId 
<<"accessing" << entityName; - throw LimaException(); + throw LimaException("MediaticData::getEntityType unknown entity group id"); } try { return EntityType(m_d->m_entityTypes[groupId]->get(entityName),groupId); } - catch(LimaException& ) { + catch(LimaException& e) { MDATALOGINIT; - LWARN << "Unknown entity type " - << entityName; + LWARN << "Unknown entity type " << entityName << "in group id:"<second; } +#ifdef ANTINNO_SPECIFIC + Q_FOREACH(auto entityType, m_entityTypes) +#else + for (auto entityType: m_entityTypes) +#endif + { + delete entityType; + } + for (auto it = m_stringsPool.begin(); it != m_stringsPool.end(); it++) + { + delete it->second; + } } const LimaString& MediaticData::getEntityTypeNameSeparator() const diff --git a/lima_common/src/common/MediaticData/mediaticData.h b/lima_common/src/common/MediaticData/mediaticData.h index fb755b50b..33f74d47b 100644 --- a/lima_common/src/common/MediaticData/mediaticData.h +++ b/lima_common/src/common/MediaticData/mediaticData.h @@ -86,8 +86,6 @@ class LIMA_MEDIATICDATA_EXPORT MediaticData : public Singleton void initMediaData(MediaId med); - void initXMLParser(); - const FsaStringsPool& stringsPool(MediaId med) const; FsaStringsPool& stringsPool(MediaId med); diff --git a/lima_common/src/common/MediaticData/tests/MediaticDataTest.cpp b/lima_common/src/common/MediaticData/tests/MediaticDataTest.cpp new file mode 100644 index 000000000..7abc26134 --- /dev/null +++ b/lima_common/src/common/MediaticData/tests/MediaticDataTest.cpp @@ -0,0 +1,112 @@ +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. If not, see +*/ +#define BOOST_TEST_DYN_LINK +#define BOOST_TEST_MODULE MediaticData +#include + +#include "common/MediaticData/mediaticData.h" + +#include "common/time/traceUtils.h" +#include "common/QsLog/QsLog.h" +#include "common/QsLog/QsLogDest.h" +#include "common/QsLog/QsLogCategories.h" +#include "common/QsLog/QsDebugOutput.h" +#include "common/AbstractFactoryPattern/AmosePluginsManager.h" + +using namespace Lima; + +// conversion functions +BOOST_AUTO_TEST_CASE( MediaticDataTest ) +{ + QsLogging::initQsLog(); + Lima::AmosePluginsManager::single(); + + std::string resourcesPath; + std::string configDir; + std::string commonConfigFile("lima-common.xml"); + std::deque langs; + langs.push_front("fre"); + + resourcesPath = std::string (qgetenv("LIMA_RESOURCES").constData()==0?"":qgetenv("LIMA_RESOURCES").constData()); + if (resourcesPath.empty()) + { + resourcesPath = "/usr/share/apps/lima/resources/"; + } + std::cerr << "MediaticData0: resourcesPath=" << resourcesPath << std::endl; + + configDir = std::string (qgetenv("LIMA_CONF").constData()==0?"":qgetenv("LIMA_CONF").constData()); + if (configDir.empty()) + { + configDir = "/usr/share/config/lima"; + } + std::cerr << "MediaticData0: configDir=" << configDir << std::endl; + + // initialize common + Common::MediaticData::MediaticData::changeable().init( + resourcesPath, + configDir, + commonConfigFile, + langs); + + // use setter: Create LinguisticProcessing.IDIOM and LinguisticProcessing.SYNTACTIC_RELATION + LimaString groupName1("LinguisticProcessing"); + Common::MediaticData::EntityGroupId group1 = Common::MediaticData::MediaticData::changeable().addEntityGroup(groupName1); + 
LimaString entityName11("IDIOM"); + LimaString entityName12("SYNTACTIC_RELATION"); + Common::MediaticData::EntityType type11 = Common::MediaticData::MediaticData::changeable().addEntity(groupName1,entityName11); + Common::MediaticData::EntityType type12 = Common::MediaticData::MediaticData::changeable().addEntity(groupName1,entityName12); + + // use setter: Create Location.CITYand Location.COUNTRY + LimaString groupName2("Location"); + Common::MediaticData::EntityGroupId group2 = Common::MediaticData::MediaticData::changeable().addEntityGroup(groupName2); + LimaString entityName21("CITY"); + LimaString entityName22("COUNTRY"); + Common::MediaticData::EntityType type21 = Common::MediaticData::MediaticData::changeable().addEntity(groupName2,entityName21); + Common::MediaticData::EntityType type22 = Common::MediaticData::MediaticData::changeable().addEntity(groupName2,entityName22); + + // test getter: get groupId from name + Lima::Common::MediaticData::EntityGroupId groupId2 = Common::MediaticData::MediaticData::single().getEntityGroupId(groupName2); + std::cerr << "groupName2 = " << groupName1 << ", groupId2 = " << groupId2 << std::endl; + BOOST_REQUIRE( groupId2 == group2 ); + // test getter: get groupName from groupId + LimaString groupName22 = Common::MediaticData::MediaticData::single().getEntityGroupName(groupId2); + BOOST_REQUIRE( groupName2 == groupName22); + + // test getter: get groupId from name + Lima::Common::MediaticData::EntityGroupId groupId1 = Common::MediaticData::MediaticData::single().getEntityGroupId(groupName1); + std::cerr << "groupName1 = " << groupName1 << ", groupId1 = " << groupId1 << std::endl; + BOOST_REQUIRE( groupId1 == group1 ); + // test getter: get groupName from groupId + BOOST_REQUIRE( groupName1 == Common::MediaticData::MediaticData::single().getEntityGroupName(groupId1)); + + // test getter: get entity name from entity + LimaString name11 = Common::MediaticData::MediaticData::single().getEntityName(type11); + LimaString 
qualifiedEntityName11("LinguisticProcessing.IDIOM"); + std::cerr << "name11 = " << name11 << std::endl; + BOOST_REQUIRE(name11==qualifiedEntityName11); + + // test getter: get entity type from name + LimaString qualifiedEntityName21("Location.CITY"); + LimaString simpleEntityName21("CITY"); + Lima::Common::MediaticData::EntityType entityType211 = Common::MediaticData::MediaticData::single().getEntityType(qualifiedEntityName21); + BOOST_REQUIRE(entityType211 == type21); + Lima::Common::MediaticData::EntityType entityType212 = Common::MediaticData::MediaticData::single().getEntityType(groupId2, simpleEntityName21); + BOOST_REQUIRE(entityType211 == type21); + + } diff --git a/lima_common/src/common/ProcessUnitFramework/AnalysisContent.cpp b/lima_common/src/common/ProcessUnitFramework/AnalysisContent.cpp index fcfff1aca..aac674a08 100644 --- a/lima_common/src/common/ProcessUnitFramework/AnalysisContent.cpp +++ b/lima_common/src/common/ProcessUnitFramework/AnalysisContent.cpp @@ -24,9 +24,15 @@ using namespace std; namespace Lima { - -AnalysisContent::AnalysisContent() : +AnalysisContent::AnalysisContent( +#ifdef ANTINNO_SPECIFIC + Lima::StopAnalyze const& sa +#endif + ) : m_analysisData() +#ifdef ANTINNO_SPECIFIC + , _stopAnalyze(sa) +#endif {} AnalysisContent::~AnalysisContent() @@ -48,6 +54,12 @@ AnalysisContent::~AnalysisContent() LDEBUG << "AnalysisContent::~AnalysisContent all data deleted"; #endif } +#ifdef ANTINNO_SPECIFIC +StopAnalyze const& AnalysisContent::stopAnalyze() const +{ + return _stopAnalyze; +} +#endif AnalysisData* AnalysisContent::getData( const std::string& id) diff --git a/lima_common/src/common/ProcessUnitFramework/AnalysisContent.h b/lima_common/src/common/ProcessUnitFramework/AnalysisContent.h index b50a560ce..6db8efc1c 100644 --- a/lima_common/src/common/ProcessUnitFramework/AnalysisContent.h +++ b/lima_common/src/common/ProcessUnitFramework/AnalysisContent.h @@ -48,8 +48,11 @@ class LIMA_PROCESSUNITFRAMEWORK_EXPORT AnalysisData class 
LIMA_PROCESSUNITFRAMEWORK_EXPORT AnalysisContent { public: - - AnalysisContent(); + AnalysisContent( +#ifdef ANTINNO_SPECIFIC + Lima::StopAnalyze const& stopAnalyze = Lima::defaultStopAnalyze +#endif + ); /** * Destroy all AnalysisData in AnalysisContent @@ -102,9 +105,15 @@ class LIMA_PROCESSUNITFRAMEWORK_EXPORT AnalysisContent */ void releaseData(const std::string& id); +#ifdef ANTINNO_SPECIFIC + StopAnalyze const& stopAnalyze() const; +#endif private: std::map m_analysisData; +#ifdef ANTINNO_SPECIFIC + Lima::StopAnalyze const& _stopAnalyze; +#endif }; diff --git a/lima_common/src/common/QsLog/QsLog.cpp b/lima_common/src/common/QsLog/QsLog.cpp index 83b2754b6..c75ad912a 100644 --- a/lima_common/src/common/QsLog/QsLog.cpp +++ b/lima_common/src/common/QsLog/QsLog.cpp @@ -34,6 +34,13 @@ #include #include + +#ifdef ANTINNO_SPECIFIC +#include +#include +#include +#endif + LIMA_COMMONQSLOG_EXPORT QDebug& operator<< (QDebug& qd, const std::string& str ) { qd << str.c_str(); @@ -42,6 +49,69 @@ LIMA_COMMONQSLOG_EXPORT QDebug& operator<< (QDebug& qd, const std::string& str namespace QsLogging { + +#ifdef ANTINNO_SPECIFIC + +namespace antinno { + +::boost::shared_ptr log; + + + + +Log4cpp::Log4cpp() +{ +} +void Log4cpp::configure(::std::string const& configFilePath) +{ + ::log4cpp::PropertyConfigurator::configure(configFilePath); + // todo : récupérer le vrai msg de l'erreur + if (!::log4cpp::Appender::reopenAll()) + { + ::std::ostringstream oss; + oss << "log4cpp::Appender::reopenAll() return false. 
Maybe a problem with file " << configFilePath; + throw ::std::exception(oss.str().data()); + } +} +bool Log4cpp::canWrite(CategoryId const& id, Level level) const +{ + return ::log4cpp::Category::getInstance(id).isPriorityEnabled(level); +} +void Log4cpp::writeRecord(CategoryId const& id, Level level, char const* pNullTerminatedUtf8String) +{ + ::log4cpp::Category::getInstance(id) << level << pNullTerminatedUtf8String; +} + + + + + +LogHelper::LogHelper(QsLogging::Level l, const QString& zone) + :_zone(zone.toStdString()), _level(l) +{ +} +::std::ostream& LogHelper::stream() +{ + return _stream; +} +LogHelper::~LogHelper() +{ + auto l = info; + switch(_level) + { + case QsLogging::TraceLevel: l = debug; break; + case QsLogging::DebugLevel: l = debug; break; + case QsLogging::InfoLevel: l = info; break; + case QsLogging::WarnLevel: l = warn; break; + case QsLogging::ErrorLevel: l = error; break; + case QsLogging::FatalLevel: l = fatal; break; + } + log->writeRecord(CategoryId(_zone.c_str()), l, _stream.str().c_str()); +} + +} +#endif + typedef QList DestinationList; static const char TraceString[] = "TRACE"; @@ -168,8 +238,8 @@ void Logger::Helper::writeToLog() QTextStream ts(&s); ts << QThread::currentThread(); const QString completeMessage(QString("%1 %2 %3 %4") - .arg(levelName, 5) .arg(QDateTime::currentDateTime().toString(fmtDateTime)) + .arg(levelName, 5) .arg(s) .arg(buffer) ); diff --git a/lima_common/src/common/QsLog/QsLog.h b/lima_common/src/common/QsLog/QsLog.h index 692e1d55f..ff4967728 100644 --- a/lima_common/src/common/QsLog/QsLog.h +++ b/lima_common/src/common/QsLog/QsLog.h @@ -38,9 +38,16 @@ #include "QsLog_export.h" +#ifdef ANTINNO_SPECIFIC +#include +#include +#endif + namespace QsLogging { + + enum Level { TraceLevel = 0, @@ -128,6 +135,61 @@ class LIMA_COMMONQSLOG_EXPORT Logger LIMA_COMMONQSLOG_EXPORT QDebug& operator<< (QDebug& qd, const std::string& str ); + + + + + +#ifdef ANTINNO_SPECIFIC + +namespace antinno { + +typedef ::std::string 
CategoryId; + +enum LIMA_COMMONQSLOG_EXPORT Level // identiques à ceux de log4cpp +{ + emerg = 0, fatal = 0, alert = 100, crit = 200, error = 300, warn = 400, notice = 500, info = 600, debug = 700 +}; + +class LIMA_COMMONQSLOG_EXPORT ILog +{ +public: + virtual void configure(::std::string const& configFilePath) = 0; + virtual bool canWrite(CategoryId const& id, Level level) const = 0; + virtual void writeRecord(CategoryId const& id, Level level, char const* pNullTerminatedUtf8String) = 0; +}; + + +class LIMA_COMMONQSLOG_EXPORT Log4cpp : public ILog +{ +public: + Log4cpp(); + void configure(::std::string const& configFilePath); + bool canWrite(CategoryId const& id, Level level) const; + void writeRecord(CategoryId const& id, Level level, char const* pNullTerminatedUtf8String); +}; + +extern LIMA_COMMONQSLOG_EXPORT ::boost::shared_ptr log; + +class LIMA_COMMONQSLOG_EXPORT LogHelper +{ +public: + explicit LogHelper(QsLogging::Level logLevel, const QString& zone); + ~LogHelper(); + ::std::ostream& stream(); +private: + QsLogging::Level const _level; + ::std::ostringstream _stream; + ::std::string const _zone; +}; + +} +#endif + + + + + } // end namespace //! Logging macros: define QS_LOG_LINE_NUMBERS to get the file and line number diff --git a/lima_common/src/common/QsLog/QsLogCategories.cpp b/lima_common/src/common/QsLog/QsLogCategories.cpp index 6ca3a46ff..755e0ddb3 100644 --- a/lima_common/src/common/QsLog/QsLogCategories.cpp +++ b/lima_common/src/common/QsLog/QsLogCategories.cpp @@ -1,184 +1,242 @@ -/* - Copyright 2002-2013 CEA LIST - - This file is part of LIMA. - - LIMA is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. 
- - LIMA is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with LIMA. If not, see -*/ -#include "QsLogCategories.h" -#include "common/tools/LimaFileSystemWatcher.h" - -#ifdef WIN32 -#pragma warning(disable: 4127) -#endif -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace Lima; - -namespace QsLogging -{ - -// static const int init = initQsLog(); - -class CategoriesImpl -{ -public: - CategoriesImpl() - { - } - QMap categories; - - LimaFileSystemWatcher m_configFileWatcher; -}; - -Categories::Categories(QObject* parent) : - QObject(parent), - d(new CategoriesImpl()) -{ - connect(&d->m_configFileWatcher,SIGNAL(fileChanged(QString)),this,SLOT(configureFileChanged(QString))); -} - -Categories::~Categories() -{ - delete d; -} - -void Categories::configureFileChanged ( const QString & path ) -{ - if (QFile(path).exists()) - { - configure(path); - } -} - -bool Categories::configure(const QString& fileName) -{ - QFile file(fileName); - QFileInfo fileInfo(fileName); - QDir configDir = fileInfo.dir(); - - if (configDir.exists("log4cpp")) - { - QString log4cppSubdirName = configDir.filePath("log4cpp"); - QFileInfo log4cppSubdirInfo(log4cppSubdirName); - if (log4cppSubdirInfo.isDir()) - { - QStringList nameFilters; - nameFilters << "log4cpp.*.properties"; - QDir log4cppSubdir(log4cppSubdirName); - QFileInfoList configFiles = log4cppSubdir.entryInfoList(nameFilters); - Q_FOREACH(QFileInfo configFile, configFiles) - { - configure(configFile.absoluteFilePath()); - } - } - } - - if (!file.open(QIODevice::ReadOnly)) - { - std::cerr << "Unable to open qslog configuration file: " << fileName.toUtf8().data() << std::endl; - return false; - } - 
d->m_configFileWatcher.addPath(fileName); - - bool res = true; - QTextStream in(&file); - QString line = in.readLine(); - while (!line.isNull()) - { - if (!line.startsWith("#")) - { - QStringList elts = line.split("="); - if (elts.size()==2 && elts.at(0).trimmed().startsWith("log4j.category.")) - { - QString category = elts.at(0).trimmed().remove(0,QString("log4j.category.").size()); - QString value = elts.at(1).trimmed(); - if (value == "TRACE") - d->categories.insert(category,QsLogging::TraceLevel); - else if (value == "DEBUG") - d->categories.insert(category,QsLogging::DebugLevel); - else if (value == "INFO") - d->categories.insert(category,QsLogging::InfoLevel); - else if (value == "WARN") - d->categories.insert(category,QsLogging::WarnLevel); - else if (value == "ERROR") - d->categories.insert(category,QsLogging::ErrorLevel); - else if (value == "FATAL") - d->categories.insert(category,QsLogging::FatalLevel); - else - { - std::cerr << "Error reading " << fileName.toUtf8().constData() << ": unknow level " << value.toUtf8().constData() << ". Using TRACE" << std::endl; - res = false; - d->categories.insert(category,QsLogging::TraceLevel); - } - } - else if (elts.size()==2 && elts.at(0).trimmed() == "include") - { - QString includedFileName = elts.at(1).trimmed(); - QString includedInitFileName = includedFileName; - if (!QFileInfo(includedInitFileName).isAbsolute()) - { - includedInitFileName = configDir.filePath(includedInitFileName); - } - configure(includedInitFileName); - } - } - line = in.readLine(); - } - return res; -} - -Level Categories::levelFor(const QString& category) const -{ -#ifdef DEBUG_CD - // Do not compile this costly check in release - if (!d->categories.contains(category)) - { - std::cerr << "Error: unknown category. 
Using TRACE for " << category.toUtf8().constData() << std::endl; - } -#endif - return d->categories.value(category, QsLogging::TraceLevel); -} - -LIMA_COMMONQSLOG_EXPORT int initQsLog(const QString& configDir) { - try { - QString initFileName = (configDir.isEmpty() ? - QString::fromUtf8(qgetenv("LIMA_CONF").isEmpty() ? - "/usr/share/config/lima" : - qgetenv("LIMA_CONF").constData()) : - configDir ) + "/log4cpp.properties"; - if (!QsLogging::Categories::instance().configure(initFileName)) - { - std::cerr << "Configure Problem " << initFileName.toUtf8().constData() << std::endl; - return -1; - } - // } -} catch(...) { - std::cerr << "Exception during logging system configuration" << std::endl; - return -1; -} -return 0; -} - -} // end namespace - +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. 
If not, see +*/ +#include "QsLogCategories.h" +#include "common/tools/LimaFileSystemWatcher.h" +#include "common/tools/FileUtils.h" + +#ifdef WIN32 +#pragma warning(disable: 4127) +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace Lima; +using namespace Lima::Common::Misc; + +namespace QsLogging +{ + +// static const int init = initQsLog(); + +class CategoriesImpl +{ +public: + CategoriesImpl() + { + } + QMap categories; + + LimaFileSystemWatcher m_configFileWatcher; +}; + +Categories::Categories(QObject* parent) : + QObject(parent), + d(new CategoriesImpl()) +{ + connect(&d->m_configFileWatcher,SIGNAL(fileChanged(QString)),this,SLOT(configureFileChanged(QString))); + QString category = "FilesReporting"; +#ifdef DEBUG_CD + d->categories.insert(category,QsLogging::InfoLevel); +#else + d->categories.insert(category,QsLogging::ErrorLevel); +#endif +} + +Categories::~Categories() +{ + delete d; +} + +void Categories::configureFileChanged ( const QString & path ) +{ + if (QFile(path).exists()) + { + configure(path); + } +} + +bool Categories::configure(const QString& fileName) +{ + QFile file(fileName); + QFileInfo fileInfo(fileName); + QDir configDir = fileInfo.dir(); + +// if (configDir.exists("log4cpp")) +// { +// QString log4cppSubdirName = configDir.filePath("log4cpp"); +// QFileInfo log4cppSubdirInfo(log4cppSubdirName); +// if (log4cppSubdirInfo.isDir()) +// { +// QStringList nameFilters; +// nameFilters << "log4cpp.*.properties"; +// QDir log4cppSubdir(log4cppSubdirName); +// QFileInfoList configFiles = log4cppSubdir.entryInfoList(nameFilters); +// Q_FOREACH(QFileInfo configFile, configFiles) +// { +// configure(configFile.absoluteFilePath()); +// } +// } +// } + + if (!file.open(QIODevice::ReadOnly)) + { + std::cerr << "Unable to open qslog configuration file: " << fileName.toUtf8().data() << std::endl; + return false; + } + d->m_configFileWatcher.addPath(fileName); + + bool res = true; 
+ QTextStream in(&file); + QString line = in.readLine(); + while (!line.isNull()) + { + if (!line.startsWith("#")) + { + QStringList elts = line.split("="); + if (elts.size()==2 && elts.at(0).trimmed().startsWith("log4j.category.")) + { + QString category = elts.at(0).trimmed().remove(0,QString("log4j.category.").size()); + QString value = elts.at(1).trimmed(); + if (value == "TRACE") + d->categories.insert(category,QsLogging::TraceLevel); + else if (value == "DEBUG") + d->categories.insert(category,QsLogging::DebugLevel); + else if (value == "INFO") + d->categories.insert(category,QsLogging::InfoLevel); + else if (value == "WARN") + d->categories.insert(category,QsLogging::WarnLevel); + else if (value == "ERROR") + d->categories.insert(category,QsLogging::ErrorLevel); + else if (value == "FATAL") + d->categories.insert(category,QsLogging::FatalLevel); + else + { + std::cerr << "Error reading " << fileName.toUtf8().constData() << ": unknow level " << value.toUtf8().constData() << ". Using TRACE" << std::endl; + res = false; + d->categories.insert(category,QsLogging::TraceLevel); + } + } + else if (elts.size()==2 && elts.at(0).trimmed() == "include") + { + QString includedFileName = elts.at(1).trimmed(); + QString includedInitFileName = includedFileName; + if (!QFileInfo(includedInitFileName).isAbsolute()) + { + includedInitFileName = configDir.filePath(includedInitFileName); + } + configure(includedInitFileName); + } + } + line = in.readLine(); + } + LOGINIT("FilesReporting"); + LINFO << "QsLog conf file loaded:" << fileName; + return res; +} + +Level Categories::levelFor(const QString& category) const +{ +#ifdef DEBUG_CD + // Do not compile this costly check in release + if (!d->categories.contains(category)) + { + std::cerr << "Error: unknown category. 
Using TRACE for " << category.toUtf8().constData() << std::endl; + } +#endif + return d->categories.value(category, QsLogging::TraceLevel); +} + +LIMA_COMMONQSLOG_EXPORT int initQsLog(const QString& configString) +{ + bool atLeastOneSuccessfulLoad = false; + QStringList configDirsList; + if (configString.isEmpty()) + { + configDirsList = buildConfigurationDirectoriesList(QStringList()<<"lima",QStringList()); + } + else + { + configDirsList = configString.split(LIMA_PATH_SEPARATOR); + } + try + { + while (! configDirsList.isEmpty() ) + { + QString configDir = configDirsList.last(); + configDirsList.pop_back(); + QDir initDir( configDir + "/log4cpp"); + if (initDir.exists()) + { + QStringList entryList = initDir.entryList(QDir::Files); + Q_FOREACH(QString entry, entryList) + { + if (QsLogging::Categories::instance().configure(configDir + "/log4cpp/" + entry)) + { + atLeastOneSuccessfulLoad = true; + } + else + { + std::cerr << "Configure Problem " << entry.toUtf8().constData() << std::endl; + return -1; + } + } + } + QString initFileName = configDir + "/log4cpp.properties"; +#ifdef ANTINNO_BUGFIX + // QFileInfo::exists(...) ne fonctionne pas avec qt 4.8 + if (QFileInfo(initFileName).exists()) +#else + if (QFileInfo::exists(initFileName)) +#endif + { + if (QsLogging::Categories::instance().configure(initFileName)) + { + atLeastOneSuccessfulLoad = true; + } + else + { + std::cerr << "Configure Problem " << initFileName.toUtf8().constData() << std::endl; + return -1; + } + } + } + } + catch(...) 
+ { + std::cerr << "Exception during logging system configuration" << std::endl; + return -1; +} + if (!atLeastOneSuccessfulLoad) + { + std::cerr << "Configure Problem no configure file has been found in" << configString.toStdString() << std::endl; + return -1; + } +return 0; +} + +} // end namespace + diff --git a/lima_common/src/common/XMLConfigurationFiles/moduleConfigurationStructure.cpp b/lima_common/src/common/XMLConfigurationFiles/moduleConfigurationStructure.cpp index fac20d6b2..4351377c2 100644 --- a/lima_common/src/common/XMLConfigurationFiles/moduleConfigurationStructure.cpp +++ b/lima_common/src/common/XMLConfigurationFiles/moduleConfigurationStructure.cpp @@ -110,12 +110,12 @@ string& ModuleConfigurationStructure::getParamValueAtKeyOfGroupNamed(const std:: } catch (NoSuchGroup& nsg) { - LWARN << "Getting param '"<module that was not found @@ -79,8 +77,6 @@ namespace Lima { //! @param name the group that was not found NoSuchGroup ( const std::string &name ) : XMLConfigurationFileException ( "No such group " + name ),groupName ( name ) {/*std::cout << "No such group " << name << std::endl;*/} virtual ~NoSuchGroup() throw() {} - //! @brief return the message error - const std::string what() {return ( "No such group " + groupName );} private: NoSuchGroup& operator=(const NoSuchGroup&) {return *this;} //! @brief the name of the group that was not found @@ -95,8 +91,6 @@ namespace Lima { //! @param name the attribute that was not found NoSuchAttribute ( const std::string &name ) : XMLConfigurationFileException ( "No such attribute " + name ),attName ( name ) {/*std::cout << "No such attribute " << name << std::endl;*/} virtual ~NoSuchAttribute() throw() {} - //! @brief return the message error - const std::string what() {return ( "No such attribute " + attName );} private: NoSuchAttribute& operator=(const NoSuchAttribute&) {return *this;} //! 
@brief the name of the attribute that was not found @@ -111,8 +105,6 @@ class LIMA_XMLCONFIGURATIONFILES_EXPORT NoSuchParam : public XMLConfigurationFil //! @param name the param that was not found NoSuchParam ( const std::string &name ) : XMLConfigurationFileException ( "No such param '" + name + "'" ),paramName ( name ) {/*std::cout << "No such param " << name << std::endl;*/} virtual ~NoSuchParam() throw() {} - //! @brief return the message error - const std::string what() {return ( "No such param '" + paramName + "'" );} private: NoSuchParam& operator=(const NoSuchParam&) {return *this;} //! @brief the name of the param that was not found @@ -127,8 +119,6 @@ class LIMA_XMLCONFIGURATIONFILES_EXPORT NoSuchList : public XMLConfigurationFile //! @param name the list that was not found NoSuchList ( const std::string &name ) : XMLConfigurationFileException ( "No such list " + name ),listName ( name ) {/*std::cout << "No such list " << name << std::endl;*/} virtual ~NoSuchList() throw() {} - //! @brief return the message error - const std::string what() {return ( "No such list " + listName );} private: NoSuchList& operator=(const NoSuchList&) {return *this;} //! @brief the name of the list that was not found @@ -143,8 +133,6 @@ class LIMA_XMLCONFIGURATIONFILES_EXPORT NoSuchMap : public XMLConfigurationFileE //! @param name the map that was not found NoSuchMap ( const std::string &name ) : XMLConfigurationFileException ( "No such map " + name ),mapName ( name ) {/*std::cout << "No such map " << name << std::endl;*/} virtual ~NoSuchMap() throw() {} - //! @brief return the message error - const std::string what() {return ( "No such map " + mapName );} private: NoSuchMap& operator=(const NoSuchMap&) {return *this;} //! 
@brief the name of the map that was not found diff --git a/lima_common/src/common/XMLConfigurationFiles/xmlConfigurationFileParser.cpp b/lima_common/src/common/XMLConfigurationFiles/xmlConfigurationFileParser.cpp index 68b64bc39..fb634df8f 100644 --- a/lima_common/src/common/XMLConfigurationFiles/xmlConfigurationFileParser.cpp +++ b/lima_common/src/common/XMLConfigurationFiles/xmlConfigurationFileParser.cpp @@ -113,6 +113,11 @@ XMLConfigurationFileParserPrivate::XMLConfigurationFileParserPrivate(const strin LERROR << "Error parsing " << m_configurationFileName.c_str(); throw XMLException(std::string("XMLConfigurationFileParser Unable to parse ") + m_configurationFileName + " : " + m_parser->errorHandler()->errorString().toUtf8().constData()); } + { + LOGINIT("FilesReporting"); + LINFO << "File parsed:" << m_configurationFileName; + } + } XMLConfigurationFileParserPrivate::~XMLConfigurationFileParserPrivate() @@ -176,21 +181,21 @@ string& XMLConfigurationFileParser::getModuleGroupParamValue(const string& modul } catch(NoSuchModule& nsm) { - std::cerr << nsm.what().c_str() << " " << m_d->m_configurationFileName.c_str() << std::endl; - LWARN << nsm.what().c_str() << " " << m_d->m_configurationFileName.c_str(); + std::cerr << nsm.what() << " " << m_d->m_configurationFileName.c_str() << std::endl; + LWARN << nsm.what() << " " << m_d->m_configurationFileName.c_str(); //not LERROR because user may want the module to be optional -> no error throw; } catch(NoSuchGroup& nsg) { - std::cerr << nsg.what().c_str() << " " << m_d->m_configurationFileName.c_str() << std::endl; - LWARN << nsg.what().c_str() << " " << m_d->m_configurationFileName.c_str(); + std::cerr << nsg.what() << " " << m_d->m_configurationFileName.c_str() << std::endl; + LWARN << nsg.what() << " " << m_d->m_configurationFileName.c_str(); throw; } catch(NoSuchParam& nsp) { - std::cerr << nsp.what().c_str() << " " << m_d->m_configurationFileName.c_str() << std::endl; - LWARN << nsp.what().c_str() << " " << 
m_d->m_configurationFileName.c_str(); + std::cerr << nsp.what() << " " << m_d->m_configurationFileName.c_str() << std::endl; + LWARN << nsp.what() << " " << m_d->m_configurationFileName.c_str(); throw; } catch(...) diff --git a/lima_common/src/common/misc/AbstractAccessIterators.cpp b/lima_common/src/common/misc/AbstractAccessIterators.cpp index d4fab5249..cd173e76e 100644 --- a/lima_common/src/common/misc/AbstractAccessIterators.cpp +++ b/lima_common/src/common/misc/AbstractAccessIterators.cpp @@ -64,11 +64,6 @@ AccessSubWordIterator& AccessSubWordIterator::operator=(const AccessSubWordItera AccessSubWordIterator::~AccessSubWordIterator() { -#ifdef DEBUG_CD - STRINGMAPLOGINIT; - LDEBUG << this << ": AccessSubWordIterator::~AccessSubWordIterator() " - ; -#endif delete m_delegate; } @@ -106,11 +101,6 @@ AccessSuperWordIterator& AccessSuperWordIterator::operator=(const AccessSuperWor } AccessSuperWordIterator::~AccessSuperWordIterator() { -#ifdef DEBUG_CD - STRINGMAPLOGINIT; - LDEBUG << this << ": AccessSuperWordIterator::~AccessSuperWordIterator() " - ; -#endif delete m_delegate; } diff --git a/lima_common/src/common/misc/DoubleAccessObjectToIdMap.tcc b/lima_common/src/common/misc/DoubleAccessObjectToIdMap.tcc index ac32e0047..9eedf40d1 100644 --- a/lima_common/src/common/misc/DoubleAccessObjectToIdMap.tcc +++ b/lima_common/src/common/misc/DoubleAccessObjectToIdMap.tcc @@ -58,7 +58,7 @@ get(const Object& val) const { typename DoubleAccessObjectToIdMap::AccessMap::const_iterator it=m_accessMap.find(&val); if (it==m_accessMap.end()) { - throw LimaException(); + throw LimaException("DoubleAccessObjectToIdMap::get(val) parameter not in map."); } else { return (*it).second; @@ -71,7 +71,7 @@ get(const Id& id) const { size_t i=(size_t) id; if (i >= m_reverseAccessMap.size()) { - throw LimaException(); + throw LimaException("DoubleAccessObjectToIdMap::get(id) parameter not in reverse map."); } else { return *(m_reverseAccessMap[i]); @@ -115,7 +115,7 @@ operator[](const 
Object& val) return (*inserted).second; } else { - throw LimaException(); + throw LimaException("DoubleAccessObjectToIdMap::operator[](val) parameter not in map"); } } else { diff --git a/lima_common/src/common/misc/stringspool.cpp b/lima_common/src/common/misc/stringspool.cpp index 88e7ddebc..ddf5dbe16 100644 --- a/lima_common/src/common/misc/stringspool.cpp +++ b/lima_common/src/common/misc/stringspool.cpp @@ -271,6 +271,11 @@ void StringsPoolPrivate::clear() // reinit from pos to the end void StringsPoolPrivate::clear(const uint64_t pos) { + // reinitialize hashPool + // WARNING: The m_hashPool hash table contains the same pointer as the m_vecPool + // vector. So, override its content BEFORE free memory to avoid crash (on Windows) + m_hashPool=m_resourcesHashPool; + // STRPOOLLOGINIT; // LDEBUG << "clearing StringsPool"; uint64_t i(pos),size(m_vecPool.size()); @@ -280,8 +285,6 @@ void StringsPoolPrivate::clear(const uint64_t pos) m_vecPool[i] = 0; } m_vecPool.resize(pos); - // reinitialize hashPool - m_hashPool=m_resourcesHashPool; } #ifndef WIN32 diff --git a/lima_common/src/common/misc/stringspool.h b/lima_common/src/common/misc/stringspool.h index c16826adc..935236d01 100644 --- a/lima_common/src/common/misc/stringspool.h +++ b/lima_common/src/common/misc/stringspool.h @@ -42,7 +42,7 @@ #include #endif #endif -#include +#include namespace Lima { diff --git a/lima_common/src/common/time/timeUtilsController.cpp b/lima_common/src/common/time/timeUtilsController.cpp index 784bde39f..efe95e1ea 100644 --- a/lima_common/src/common/time/timeUtilsController.cpp +++ b/lima_common/src/common/time/timeUtilsController.cpp @@ -41,7 +41,12 @@ TimeUtilsController::~TimeUtilsController() { uint64_t delta = TimeUtils::elapsedTime(m_topic); if (m_logElapsedTime) { TIMELOGINIT; +#ifdef ANTINNO_SPECIFIC + // FWI 09/11/2015 gestion temps en microsecondes sous windows + LINFO << m_topic << " ( ): " << delta << " us"; +#else LINFO << m_topic << " ( ): " << delta << " ms"; +#endif } 
} diff --git a/lima_common/src/common/time/traceUtils.cpp b/lima_common/src/common/time/traceUtils.cpp index 4249a03cf..306ecef14 100644 --- a/lima_common/src/common/time/traceUtils.cpp +++ b/lima_common/src/common/time/traceUtils.cpp @@ -29,30 +29,105 @@ #include "traceUtils.h" #include +#ifdef ANTINNO_SPECIFIC +// FWI 28/10/2015 modifs pour utiliser une horloge plus précise (en us au lieu de ms) sous windows +// + ajout d'un compteur + +#ifdef WIN32 +#include "Windows.h" +#include + + +LARGE_INTEGER m_f; +static bool m_freqInit = false; + +namespace +{ + uint64_t _winTime() + { + LARGE_INTEGER i; + if (m_freqInit == false) + { + QueryPerformanceFrequency(&m_f); + m_freqInit = true; + } + QueryPerformanceCounter(&i); + + return (i.QuadPart * 1000000) / m_f.QuadPart; // microseconds + } +} +#else +#error no implementation for non-win32 systems +#endif +#endif + + namespace Lima { //********************************************************************** //initialization of static members //********************************************************************** // uint64_t TimeUtils::currentTime={0,0}; +#ifdef ANTINNO_SPECIFIC +std::map TimeUtils::m_cumulatedTime = std::map(); +#else std::map > TimeUtils::m_cumulatedTime = std::map >(); +#endif QMutex TimeUtils::m_mutex; + +#ifdef ANTINNO_SPECIFIC +TimeUtils::TimeUtils() +{ +} +#endif //********************************************************************** // member functions //********************************************************************** -uint64_t TimeUtils::getCurrentTime() { +uint64_t TimeUtils::getCurrentTime() { +#ifdef ANTINNO_SPECIFIC +#ifdef WIN32 + return _winTime(); +#else +#error no implementation for non-win32 systems +#endif +#else return QDateTime::currentMSecsSinceEpoch(); +#endif } +#ifdef ANTINNO_SPECIFIC +// FWI 03/11/24 nouvelle méthode pour remettre à zéro le cumul +void TimeUtils::restart( const std::string& taskCategory) +{ + QMutexLocker locker(&m_mutex); +#ifdef WIN32 + 
m_cumulatedTime[taskCategory].first = _winTime(); + //cout << "updateCurrentTime=" << m_cumulatedTime[taskCategory].first << ::std::endl; +#else + m_cumulatedTime[taskCategory].first = QDateTime::currentMSecsSinceEpoch(); +#endif + m_cumulatedTime[taskCategory].second = 0; + m_cumulatedTime[taskCategory].count = 0; +} +#endif void TimeUtils::updateCurrentTime( const std::string& taskCategory ) { QMutexLocker locker(&m_mutex); +#ifdef ANTINNO_SPECIFIC +#ifdef WIN32 + m_cumulatedTime[taskCategory].first = _winTime(); + //cout << "updateCurrentTime=" << m_cumulatedTime[taskCategory].first << ::std::endl; +#else +#error no implementation for non-win32 systems +#endif +#else m_cumulatedTime[taskCategory].first = QDateTime::currentMSecsSinceEpoch(); +#endif } // void TimeUtils::updateCurrentTime() { -// boost::mutex::scoped_lock(m_mutex); +// ::boost::mutex::scoped_lock(m_mutex); // gettimeofday(¤tTime,0); // } @@ -67,10 +142,32 @@ uint64_t TimeUtils::diffTime(const uint64_t& begin, } uint64_t TimeUtils::elapsedTime(const std::string& taskCategory) { +#ifdef ANTINNO_SPECIFIC +#ifdef WIN32 + uint64_t newTime = _winTime(); +#else +#error no implementation for non-win32 systems +#endif +#else uint64_t newTime = QDateTime::currentMSecsSinceEpoch(); +#endif + //cout << "newTime=" << newTime << ::std::endl; + //cout << "oldTime=" << m_cumulatedTime[taskCategory].first << ::std::endl; +#ifdef ANTINNO_SPECIFIC +#ifdef WIN32 + uint64_t delta = newTime - m_cumulatedTime[taskCategory].first; +#else +#error no implementation for non-win32 systems +#endif +#else uint64_t delta = diffTime(m_cumulatedTime[taskCategory].first,newTime); +#endif + //cout << "delta=" << delta << ::std::endl; m_cumulatedTime[taskCategory].second += delta; m_cumulatedTime[taskCategory].first = newTime; +#ifdef ANTINNO_SPECIFIC + ++m_cumulatedTime[taskCategory].count; + #endif return delta; } @@ -80,7 +177,11 @@ uint64_t TimeUtils::elapsedTime(const std::string& taskCategory) { void 
TimeUtils::logElapsedTime(const std::string& mess, const std::string& taskCategory) { TIMELOGINIT; +#ifdef ANTINNO_SPECIFIC + LINFO << mess << "(" << taskCategory << "): " << TimeUtils::elapsedTime(taskCategory) << " us"; +#else LINFO << mess << "(" << taskCategory << "): " << TimeUtils::elapsedTime(taskCategory) << " ms"; +#endif } /** @@ -89,15 +190,27 @@ void TimeUtils::logElapsedTime(const std::string& mess, void TimeUtils::logCumulatedTime(const std::string& mess, const std::string& taskCategory) { TIMELOGINIT; +#ifdef ANTINNO_SPECIFIC + LINFO << std::setfill('0') << std::setw(9) << m_cumulatedTime[taskCategory].second << " us" + << " count : " << std::setfill('0') << std::setw(6) << m_cumulatedTime[taskCategory].count << ": " << mess; +#else LINFO << mess << ": " << m_cumulatedTime[taskCategory].second << " ms"; +#endif } void TimeUtils::logAllCumulatedTime(const std::string& mess) { TIMELOGINIT; LINFO << mess << ": "; + +#ifdef ANTINNO_SPECIFIC + for( std::map::const_iterator it = m_cumulatedTime.begin() ; + it != m_cumulatedTime.end() ; it++ ) { + LINFO << it->first << ":" << it->second.second << " us" << " count: " << it->second.count; +#else for( std::map >::const_iterator it = m_cumulatedTime.begin() ; it != m_cumulatedTime.end() ; it++ ) { LINFO << it->first << ":" << it->second.second << " ms" ; +#endif } } diff --git a/lima_common/src/common/time/traceUtils.h b/lima_common/src/common/time/traceUtils.h index 641a87afb..137ae10ef 100644 --- a/lima_common/src/common/time/traceUtils.h +++ b/lima_common/src/common/time/traceUtils.h @@ -54,7 +54,16 @@ namespace Lima { class LIMA_TIME_EXPORT TimeUtils { public: - TimeUtils() {} +#ifdef ANTINNO_SPECIFIC + // FWI 04/11/2015 ajout classe + struct Data + { + uint64_t first; + uint64_t second; + uint64_t count; + }; +#endif + TimeUtils(); ~TimeUtils() {} /** @@ -64,6 +73,10 @@ namespace Lima { */ // static void updateCurrentTime( const std::string& taskCategory = std::string("") ); static void updateCurrentTime( 
const std::string& taskCategory = std::string("") ); +#ifdef ANTINNO_SPECIFIC + // FWI 03/11/24 nouvelle méthode + static void restart( const std::string& taskCategory = std::string("") ); +#endif // static void setCurrentTime(uint64_t time); static void setCurrentTime(uint64_t time, const std::string& taskCategory = std::string("")); @@ -111,8 +124,14 @@ namespace Lima { private: /** last current time stored */ // static uint64_t currentTime; +#ifdef ANTINNO_SPECIFIC + // FWI 04/11/2015 remplacement de pair par Data + static std::map m_cumulatedTime; +#else static std::map > m_cumulatedTime; +#endif static QMutex m_mutex; + }; } // end namespace diff --git a/lima_common/src/common/tools/FileUtils.cpp b/lima_common/src/common/tools/FileUtils.cpp new file mode 100644 index 000000000..155d23de6 --- /dev/null +++ b/lima_common/src/common/tools/FileUtils.cpp @@ -0,0 +1,201 @@ +/* + Copyright 2015 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. 
If not, see +*/ +/************************************************************************ + * @file FileUtils.h + * @author Gael de Chalendar + * @date Tue Jul 7 2015 + * copyright Copyright (C) 2015 by CEA LIST + ***********************************************************************/ + +#include "FileUtils.h" +#ifdef ANTINNO_SPECIFIC +#include +#endif +#include +#include + +namespace Lima { +namespace Common { +namespace Misc { + +uint64_t countLines(std::istream& file) +{ + uint64_t result = 0; + std::streampos initialPosition = file.tellg(); + int c = file.get(); + while (c != -1) + { + while (c != -1 && c != '\n') + { + c = file.get(); + } + result = result + 1; + c = file.get(); + } + file.clear(); + file.seekg(initialPosition, std::ios_base::beg); + return result; +} + +uint64_t countLines(QFile& file) +{ + uint64_t result = 0; + qint64 initialPosition = file.pos(); + char c = '\0'; + while (!file.atEnd()) + { + while (!file.atEnd() && c != '\n') + { + file.getChar(&c); + } + result = result + 1; + file.getChar(&c); + } + file.seek(initialPosition); + return result; +} + +QStringList buildConfigurationDirectoriesList(const QStringList& projects, const QStringList& paths) +{ + QStringList configDirs; +#ifdef ANTINNO_SPECIFIC + BOOST_FOREACH(const QString& project, projects) +#else + for (const QString& project: projects) +#endif + { + QStringList confDirs; + QString projectConf = QString::fromUtf8(qgetenv((project.toUpper()+"_CONF").toStdString().c_str()).constData()); + if (!projectConf.isEmpty()) + confDirs << projectConf.split(LIMA_PATH_SEPARATOR); +#ifdef ANTINNO_SPECIFIC + BOOST_FOREACH(const QString &configDir, confDirs) +#else + for (const QString &configDir: confDirs ) +#endif + { + if (!configDir.isEmpty() && QDir(configDir).exists()) + { + configDirs << configDir; + } + } + if (confDirs.isEmpty()) + { + QString configDir = QString::fromUtf8(qgetenv((project.toUpper()+"_DIST").toStdString().c_str()).constData()) + "/share/config/" + project; + if 
(!configDir.isEmpty() && QDir( configDir ).exists() ) + { + configDirs << configDir; + } + else + { + configDir = QString::fromUtf8("/usr/share/config/") + project; + if (!configDir.isEmpty() && QDir( configDir ).exists() ) + { + configDirs << configDir; + } + } + } + } +#ifdef ANTINNO_SPECIFIC + BOOST_FOREACH(const QString& path, paths) +#else + for (const QString& path: paths) +#endif + { + if (!path.isEmpty() && QDir(path).exists()) + configDirs << path; + } + + return configDirs; +} + +QStringList buildResourcesDirectoriesList(const QStringList& projects, const QStringList& paths) +{ + QStringList resourcesDirs; +#ifdef ANTINNO_SPECIFIC + BOOST_FOREACH(const QString& project, projects) +#else + for (const QString& project: projects) +#endif + { + QStringList resDirs; + QString projectRes = QString::fromUtf8(qgetenv((project.toUpper()+"_RESOURCES").toStdString().c_str()).constData()); + if (!projectRes.isEmpty()) + resDirs << projectRes.split(LIMA_PATH_SEPARATOR); +#ifdef ANTINNO_SPECIFIC + BOOST_FOREACH(const QString &resourcesDir, resDirs) +#else + for (const QString &resourcesDir: resDirs ) +#endif + { + if (QDir(resourcesDir).exists()) + { + resourcesDirs << resourcesDir; + } + } + if (resDirs.isEmpty()) + { + QString resourcesDir = QString::fromUtf8(qgetenv((project.toUpper()+"_DIST").toStdString().c_str()).constData()) + "/share/apps/" + project + "/resources"; + if ( QDir( resourcesDir ).exists() ) + { + resourcesDirs << resourcesDir; + } + else + { + resourcesDir = QString::fromUtf8("/usr/share/apps/") + project + "/resources"; + if ( QDir( resourcesDir ).exists() ) + { + resourcesDirs << resourcesDir; + } + } + } + } +#ifdef ANTINNO_SPECIFIC + BOOST_FOREACH(const QString& path, paths) +#else + for (const QString& path: paths) +#endif + { + if (QDir(path).exists()) + resourcesDirs << path; + } + + return resourcesDirs; +} + +QString findFileInPaths(const QString& paths, const QString& fileName, const QChar& separator) +{ + QStringList pathsList = 
paths.split(separator); + Q_FOREACH(QString path, pathsList) + { + if (QFileInfo(path+ "/" + fileName).exists()) + { + return path+ "/" + fileName; + } + } + std::cerr << "WARNING: findFileInPaths no '" << fileName.toUtf8().constData() + << "' found in '" << paths.toUtf8().constData() + << "' separated by '" << separator.toLatin1() << "'" << std::endl; + return QString(); +} + + +} // end namespace +} // end namespace +} // end namespace diff --git a/lima_common/src/common/tools/FileUtils.h b/lima_common/src/common/tools/FileUtils.h new file mode 100644 index 000000000..a38507970 --- /dev/null +++ b/lima_common/src/common/tools/FileUtils.h @@ -0,0 +1,130 @@ +/* + Copyright 2015 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. If not, see +*/ +/************************************************************************ + * @file FileUtils.h + * @author Gael de Chalendar + * @date Tue Jul 7 2015 + * copyright Copyright (C) 2015 by CEA LIST + ***********************************************************************/ + +#ifndef LIMA_COMMON_MISC_FILEUTILS_H +#define LIMA_COMMON_MISC_FILEUTILS_H + +#include "common/LimaCommon.h" + +#include +#ifdef ANTINNO_BUGFIX +// nécessaire sinon le compilateur dit que QStringList n'a pas de constructeur... 
+#include +#endif + +#include + +#ifdef WIN32 +#ifdef ANTINNO_SPECIFIC +// nécessaire sinon on a une erreur c2664 : impossible de convertir de 'char' à 'Qstring' dans 'QString::join' +static QChar const LIMA_PATH_SEPARATOR(';'); +#else +#define LIMA_PATH_SEPARATOR ';' +#endif +#else +#define LIMA_PATH_SEPARATOR ':' +#endif + +namespace Lima { +namespace Common { +namespace Misc { + +/** + * Count the number of lines in the given file from the current position + * + * If the last line has no character (no character after the last line break)' it is not counted. + * After this function, the file is in the same good state and at the same position. + * + * @param file the file to count the lines of + * + * @return the number of lines of the file + */ +LIMA_COMMONTOOLS_EXPORT uint64_t countLines(std::istream& file); + +/** + * Count the number of lines in the given file from the current position + * + * If the last line has no character (no character after the last line break)' it is not counted. + * After this function, the file is at the same position. + * + * @param file the file to count the lines of + * + * @return the number of lines of the file + */ +LIMA_COMMONTOOLS_EXPORT uint64_t countLines(QFile& file); + + +/** + * @brief Build a list of configuration directories from a list of project + * names and a list of paths. + * + * For each project name "project", try to add the dir from the environment + * variable $PROJECT_CONF. If it does not exist, try + * $PROJECT_DIST/share/config/project. If it does not exist either, try + * /usr/share/config/project. + * Then add existing paths from the given list. + * In LIMA the projects list will be limited to the single element "lima" but + * projects depending on LIMA will be able to add their own separate + * configurations. + * + * @param projects The list of project names to explore + * @param paths The list of paths to look into. 
+ */ +LIMA_COMMONTOOLS_EXPORT QStringList buildConfigurationDirectoriesList(const QStringList& projects, + const QStringList& paths = QStringList() ); + +/** + * @brief Build a list of resources directories from a list of project names + * and a list of paths. + * + * For each project name "project", try to add the dir from the environment + * variable $PROJECT_RESOURCES. If it does not exist, try + * $PROJECT_DIST/share/apps/project/resources. If it does not exist either, try + * /usr/share/apps/project/resources. + * Then add existing paths from the given list. + * In LIMA the projects list will be limited to the single element "lima" but + * projects depending on LIMA will be able to add their own separate + * resources. + * + * @param projects The list of project names to explore + * @param paths The list of paths to look into. + */ +LIMA_COMMONTOOLS_EXPORT QStringList buildResourcesDirectoriesList(const QStringList& projects, + const QStringList& paths = QStringList()); + +/** + * Find the given file in the given paths. + * @param paths the list of concatenated paths to search th file in + * @param fileName the name of the file to search into the paths. Can include a relative path + * @param separator the character used to split the list of paths. Defaults to semicolon + * @return the full path of the found file if found. Empty string otherwise. + */ +LIMA_COMMONTOOLS_EXPORT QString findFileInPaths(const QString& paths, const QString& fileName, const QChar& separator = LIMA_PATH_SEPARATOR); + +} // end namespace +} // end namespace +} // end namespace + +#endif diff --git a/lima_common/test/testFsaDict16.cpp b/lima_common/test/testFsaDict16.cpp index 79697b443..d58fd24d5 100644 --- a/lima_common/test/testFsaDict16.cpp +++ b/lima_common/test/testFsaDict16.cpp @@ -1,924 +1,949 @@ -/* - Copyright 2002-2013 CEA LIST - - This file is part of LIMA. 
- - LIMA is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - LIMA is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with LIMA. If not, see -*/ -/*************************************************************************** - testFsaDict16.cpp - description - ------------------- - begin : lun jun 2 2003 - copyright : (C) 2003 by Olivier Mesnard - email : olivier.mesnard@cea.fr -// ***************************************************************************/ - -/*************************************************************************** - * * - * compact dictionnary based on finite state automata * - * implemented with Boost Graph library * - * * - ***************************************************************************/ -#include "common/LimaCommon.h" - -#include "common/time/traceUtils.h" - -// string and file handling Utilities -#include "common/Data/strwstrtools.h" - -// dictionaries -#include "common/FsaAccess/FsaAccessBuilder16.h" -#include "common/FsaAccess/FsaAccessBuilderRandom16.h" -#include "common/FsaAccess/FsaAccessSpare16.h" -#include "common/misc/AbstractAccessByString.h" - -#include - -// for set locale -#include -// for system() -#include - -#include -#include -#include -#include -#include -#include -#include - -// For ::stat() function -#include -#include -#ifndef WIN32 -#include -#endif -using namespace std; -using namespace Lima; -using namespace Lima::Common; - -int logFileSize( const std::string& filename ) { - struct stat sts; - if( stat( filename.c_str(), &sts) != 0) - std::cerr << 
"logFileSize: error getting info for file " << filename << std::endl; - std::cout << "taille fichier: " << filename << "= " << sts.st_size << std::endl; - return sts.st_size; -} - -void logMemsize( const string& legend ) { -#ifdef WIN32 - LIMA_UNUSED(legend); -#else - pid_t pid = getpid(); - ostringstream ostr; - ostr << "/proc/" << pid << "/status"; - ifstream statusFile(ostr.str().c_str(), std::ifstream::binary); - char strbuff[200]; - for( ; ; ) { - string status; - statusFile.getline(strbuff, 200, '\n' ); - string line(strbuff); - if(line.empty() ) - break; - string::size_type composed1_pos = line.find("VmSize:"); - if( composed1_pos != string::npos ) { - string vmSizeStr(line, composed1_pos+7); - int vmSize = atoi(vmSizeStr.c_str()); - std::cerr << legend << " VmSize:" << vmSize; - } - } -#endif -} - -int getProcStat( const std::string& toLog ) { -#ifdef WIN32 - LIMA_UNUSED(toLog); - return 0; -#else - std::string statusFile; - - ostringstream os; - os << "/proc/" << getpid() << "/status"; - statusFile=os.str(); - - ifstream statusIn(statusFile.c_str(),ios::in | std::ifstream::binary); - string line; - int val; - while (!statusIn.eof()) - { - getline(statusIn,line); -// std::cout << "line = " << line << std::endl; - size_t index=line.find(toLog); - if( index != std::string::npos ) { -// std::cout << "index = " << index << std::endl; - string valstr=line.substr(index+toLog.size()+1); -// std::cout << "valstr = " << valstr << std::endl; - val = atoi(valstr.c_str()); - std::cout << toLog << "=" << val < & listOfWords ) - { - std::ifstream wList(listOfWordsFilename.c_str(), std::ios::in | std::ios::binary ); - if ( !wList.is_open() ) { - std::cerr << "Cannot open list of words " << listOfWordsFilename << std::endl; - return EXIT_FAILURE; - } - std::cerr << "Read list of words" << std::endl; - char strbuff[200]; - - for( int counter = 0 ; ; counter++ ) { - // lecture d'une ligne du fichier - wList.getline(strbuff, 200, '\n' ); - string line(strbuff); - if( 
line.size() == 0 ) { - std::cerr << "end of list of words. counter=" << counter << std::endl; - break; - } - else { - // extraction cha�e - Lima::LimaString word = Lima::Common::Misc::utf8stdstring2limastring(line); - listOfWords.push_back(word); - } - } - return EXIT_SUCCESS; -} - - -template -class DictTester { - public: - DictTester(Param param, dictType &dico) : m_param(param), m_dico(dico) { - } - void exec( void ); - void testSub(std::vector& hyperwords, - std::vector& offsets, - std::vector > &subwords, bool withAssert ); - void testSuper(typename std::vector::const_iterator begin, - typename std::vector::const_iterator end ); - void testIndex( typename std::vector::const_iterator begin, - typename std::vector::const_iterator end, - const std::vector& indexes ); - void testSpelling( typename std::vector::const_iterator begin, - typename std::vector::const_iterator end, - const std::vector& indexes ); - void addListOfWords(); - void addListOfUnorderedWords(); - void write( void ); - private: - Param m_param; - dictType &m_dico; -}; - - -template -void DictTester::addListOfWords() { - - if( !m_param.listOfWords.size() ) - return; - - std::ifstream wList(m_param.listOfWords.c_str(), std::ios::in | std::ios::binary ); - if ( !wList.is_open() ) { - std::cerr << "Cannot open list of words " << m_param.listOfWords << std::endl; - return; - } - std::cerr << "Read list of words" << std::endl; - char strbuff[200]; - - for( int counter = 0 ; ; counter++ ) { - if( (counter%10000) == 0 ) { - ostringstream ostr; - ostr << "\naddListOfWords counter = " << counter; -// std::cerr << "addListOfWords counter = " << counter << std::endl; - logMemsize( ostr.str() ); - } - // lecture d'une ligne du fichier - wList.getline(strbuff, 200, '\n' ); - string line(strbuff); - if( wList.eof() ) - { - std::cerr << "end of list of words. 
counter=" << counter << std::endl; - break; - } - else if (!line.empty()) - { -// std::cerr << "addListOfWords: (" << line << ")" << std::endl; - Lima::LimaString word = Lima::Common::Misc::utf8stdstring2limastring(line); -// std::cerr << "addListOfWords: addWord(" << word << ")" << std::endl; - m_dico.addWord( word ); - } - } - std::cerr << std::endl; - m_dico.pack(); -} - -template -void DictTester::addListOfUnorderedWords() { - - if( m_param.printGraph ) { - std::cerr << "Print graph...." << std::endl; - m_dico.printGraph(std::cerr); - } - - if( !m_param.listOfWords.compare(std::string("")) ) - return; - - std::vector listOfWords; - readListOfWords(m_param.listOfWords, listOfWords); - - int counter(0); - for( std::vector::iterator itWord = listOfWords.begin() ; - itWord != listOfWords.end() ; itWord++, counter++ ) { -// if( (counter%10000) == 0 ) { - std::cerr << "addListOfWords(" << *itWord << "), counter = " << counter << std::endl; -// } - m_dico.addRandomWord( *itWord ); - } - - if( m_param.printGraph ) { - std::cerr << "Print graph...." 
<< std::endl; - m_dico.printGraph(std::cerr); - } -// m_dico.pack(); -} - -template -void DictTester::testIndex( - typename std::vector::const_iterator begin, - typename std::vector::const_iterator end, - const std::vector& indexes ) { - std::cout << "testIndex: getSize() = " << m_dico.getSize() << std::endl; - - std::vector::const_iterator indexItr = indexes.begin(); - int index0 = 0; - - for( typename std::vector::const_iterator lemma = begin ; - lemma != end ; lemma++ ) { - // recup�ation de l'index �partir de la cha�e de caract�es - int index = m_dico.getIndex(*lemma); - // traces - if( index%10000 == 0 ) { - ostringstream ostr; - ostr << "testIndex index = " << index; -// std::cerr << "addListOfWords counter = " << counter << std::endl; - logMemsize( ostr.str() ); - } - if( m_param.withDebug ) { - Lima::LimaString newWord = *lemma; - std::cout << "testIndex: getIndex(" - << Lima::Common::Misc::limastring2utf8stdstring(newWord) - << ")=" << index << std::endl; - } - else { - if( index%10000 == 1 ) { - Lima::LimaString newWord = *lemma; - std::cout << "testIndex: getIndex(" << Lima::Common::Misc::limastring2utf8stdstring(newWord) - << ")=" << index << std::endl; - } - } - // result verification - if( m_param.withAssert ) { - if( indexItr != indexes.end() ) { -// std::cerr << "check " << index << "!=" << *indexItr << std::endl; - assert( index == *indexItr); - indexItr++; - } - else { -// std::cerr << "check " << index << "!=" << index0+1 << std::endl; - assert( index == index0+1 ); - index0 = index; - } - } - } - - // test sur chaine n'existant pas - for( typename std::vector::const_iterator lemma = begin ; - lemma != end ; lemma++ ) { - int index = m_dico.getIndex(*lemma); - Lima::LimaString invertedLemma; - for( int i = (*lemma).size()-1; i >= 0 ; i-- ) { - invertedLemma.push_back((*lemma)[i]); - } - int invertedIndex = m_dico.getIndex(invertedLemma); - // traces - if( index%10000 == 0 ) { - ostringstream ostr; - ostr << "testIndex inverted (" - << 
Lima::Common::Misc::limastring2utf8stdstring(invertedLemma) - << ") index = " << invertedIndex; - logMemsize( ostr.str() ); - } - } -} - -template - void DictTester::testSpelling( typename std::vector::const_iterator begin, - typename std::vector::const_iterator end, - const std::vector& indexes ) -//void DictTester::testSpelling( int *indexVal, int nbIndex ) -{ - LIMA_UNUSED(end); - typename std::vector::const_iterator lemma = begin; - - // if size of indexes = 1, we just display the string return by getSpelling() - std::cout << "testSpelling: getSpelling: indexes.size()=" << indexes.size() << std::endl; - if( indexes.size() == 1 ) { - Lima::LimaString spelling; - spelling = m_dico.getSpelling(indexes[0]); - std::cout << "testSpelling: getSpelling(" << indexes[0] - << ")=" << Lima::Common::Misc::limastring2utf8stdstring(spelling) << std::endl; - } - // for each id, compare result of getSpelling with element in vector of string [begin,end] - for( uint32_t i = 0 ; i < indexes.size() ; i++ ) { - Lima::LimaString spelling; - try{ - spelling = m_dico.getSpelling(indexes[i]); - if( i%10000 == 1 ) { - std::cout << "testSpelling: getSpelling(" << indexes[i] - << ")=" << Lima::Common::Misc::limastring2utf8stdstring(spelling) << std::endl; - } - if( m_param.withAssert ) { - assert( spelling == (*lemma) ); - } - } - catch(std::logic_error e ) { - std::cout << "testSpelling exception: " << e.what() << std::endl; - } - lemma++; - } -} - -template -void DictTester::testSuper( - typename std::vector::const_iterator begin, - typename std::vector::const_iterator end ) { - - for( typename std::vector::const_iterator it = begin ; - it != end ; it++ ) { - try{ - Lima::LimaString prefix = *it; - std::pair entries = - m_dico.getSuperWords(prefix); - std::cout << "testSuper: getSuperWords(" - << Lima::Common::Misc::limastring2utf8stdstring(prefix) - << ")" << std::endl; - for( ; entries.first != entries.second ; entries.first++ ) { - Lima::LimaString superWord = *(entries.first); - 
std::cout << Lima::Common::Misc::limastring2utf8stdstring(superWord) - << ", " << std::endl; - } - std::cout << std::endl; - } - catch(std::logic_error e ) { - std::cout << "testSuper: getSuperWords exception: " << e.what() << std::endl; - } - } -} - -template - void DictTester::testSub( - std::vector & hyperwords, - std::vector & offsets, - std::vector >& subwords, bool withAssert ) { - - typename std::vector::iterator wordIt; - std::vector::iterator offsetIt = offsets.begin(); - typename std::vector >::iterator answersIt = subwords.begin(); - for( wordIt = hyperwords.begin(); wordIt != hyperwords.end() ; wordIt++ ) { - try{ - Lima::LimaString word = *wordIt; - std::pair entries = m_dico.getSubWords(*offsetIt,word); - FSAALOGINIT; - LDEBUG << "test getSubWords(" - << ", " << word << ")" ; - for( AccessSubWordIterator entry = entries.first ; entry != entries.second ; entry++ ) { - LINFO << "string(" << *offsetIt << "," << (*entry).first << "), "; - } - LINFO ; - for( AccessSubWordIterator entry = entries.first ; entry != entries.second ; entry++ ) { - Lima::LimaString subWord = word.mid(*offsetIt, (*entry).first - *offsetIt); - LINFO << subWord << ", "; - } - LINFO ; - if( withAssert ) { - // r�up�ation des r�onses attendues pour v�ifications - assert( answersIt != subwords.end() ); - std::vector answers = *(answersIt++); - typename std::vector::iterator answerIt = answers.begin(); - for( AccessSubWordIterator entry = entries.first ; entry != entries.second ; entry++ ) { - assert( answerIt != answers.end() ); - Lima::LimaString subWord = word.mid(*offsetIt, (*entry).first - *offsetIt); - assert(!subWord.compare(*answerIt)); - answerIt++; - } - } - } - catch(std::logic_error e ) { - std::cout << "testSub: getSubWords exception: " << e.what() << std::endl; - } - offsetIt++; - } -} - -template -void DictTester::exec( void ) { - if( m_param.withDebug ) { - std::cerr << "Print dictionary...." 
<< std::endl; - m_dico.print(std::cout); - } -} - -template -void DictTester::write( void ) { - try { - if( m_param.outputDico.size() > 0 ) { - std::cerr << "Write dictionary...." << std::endl; - m_dico.write(m_param.outputDico); - } - } - catch(LimaException e ) { - std::cout << "write: exception: " << e.what() << std::endl; - } -} - -int main(int argc, char *argv[]) -{ - QCoreApplication a(argc, argv); - QsLogging::initQsLog(); - - cerr << argv[0] << " begin..." << endl << " command line: "; - for (int i = 0; i < argc; i++) - { - std::cerr << argv[i] << " "; - } - std::cerr << std::endl; - - setlocale(LC_ALL, ""); -#ifdef DEBUG_CD - FSAALOGINIT; - LDEBUG << argv[0] << " begin..." ; -#endif - - // options reading - Param param = { - std::string(), // listOfWords - std::string(), // outputDico - std::string(), // inputDico - false, // subWord - std::string(), // listOfHyperwords - false, // superWord - false, // printGraph - false, // spareMem - one_byte, // charSize - false, // withoutTemplate - true, // trieDirectionForward - false, // withDebug - false, // runPerfo - false, // runIndex - false, // addWord - false, // runSpelling - -1, // termId (-1 means no termId specified by user) - false, // composed - false, // withAssert - std::string() // inputDico - }; - - for (int i = 1 ; i < argc; i++) { - QString arg = QString::fromUtf8(argv[i]); - int pos = -1; - if (arg == "--help") - { - std::cerr << "usage: " << argv[0] - << " --help" << std::endl; - std::cerr << " " << argv[0] - << " [--output=]" - << " [--input=]" - << " [--printGraph]" - << " [--subWord]" - << " [--listOfHyperwords=]" - << " [--listOfWords=]" - << " [--superWord]" - << " [--spare]" - << " [--runIndex]" - << " [--addWord]" - << " [--runSpelling]" - << " [--termId=nn" - << " [--composed=]" - << " [--charSize=<1|2|4>]" - << " [--withoutTemplate" - << " [--reverse]" - << " [--withDebug]" - << " [--runPerfo]" - << " [--withAssert]" - << std::endl; - return 0; - } - else if ( (pos = 
arg.indexOf("--input=")) != -1 ){ - param.inputDico = arg.mid(pos+8).toUtf8().data(); - } - else if ( (pos = arg.indexOf("--output=")) != -1 ){ - param.outputDico = arg.mid(pos+9).toUtf8().data(); - } - else if ( arg =="--printGraph" ){ - param.printGraph = true; - } - else if ( arg == "--subWord" ){ - param.subWord = true; - } - else if ( (pos = arg.indexOf("--listOfHyperwords=")) != -1 ){ - param.listOfHyperwords = arg.mid(pos+19).toUtf8().data(); - } - else if ( (pos = arg.indexOf("--listOfWords=")) != -1 ){ - param.listOfWords = arg.mid(pos+14).toUtf8().data(); - } - else if ( arg == "--superWord" ){ - param.superWord = true; - } - else if ( arg == "--withDebug" ){ - param.withDebug = true; - } - else if ( arg == "--runPerfo" ){ - param.runPerfo = true; - } - else if ( arg == "--withoutTemplate" ){ - param.withoutTemplate = true; - } - else if ( (pos = arg.indexOf("--charSize=")) != -1 ){ - int charSize = (arg.mid(pos+11)).toInt(); - switch(charSize) { - case 1: - param.charSize = one_byte; - break; - case 2: - param.charSize = two_bytes; - break; - case 4: - param.charSize = four_bytes; - break; - } - } - else if ( arg == "--spare" ){ - param.spareMem = true; - } - else if ( arg == "--runIndex" ){ - param.runIndex = true; - } - else if ( arg == "--addWord" ){ - param.addWord = true; - } - else if ( arg == "--runSpelling" ){ - param.runSpelling = true; - } - else if ( (pos = arg.indexOf("--termId=")) != -1 ){ - param.termId = (arg.mid(pos+9)).toInt(); - } - else if ( arg == "--reverse" ){ - param.trieDirectionForward = false; - } - else if ( (pos = arg.indexOf("--composed=")) != -1 ){ - param.composed = true; - param.inputDicoComp = arg.mid(pos+12).toUtf8().data(); - } - else if ( arg == "--withAssert" ){ - param.withAssert = true; - } - } - - cerr << argv[0] << ": "; - if(param.withDebug) - cerr << "--withDebug "; - if(param.runPerfo) - cerr << "--runPerfo "; - if(param.spareMem) - cerr << "--spare "; - if(param.runIndex) - cerr << "--runIndex "; - 
if(param.addWord) - cerr << "--addWord "; - if(param.runSpelling) - cerr << "--runSpelling "; - if(param.printGraph) - cerr << "--printGraph "; - if(!param.trieDirectionForward) - cerr << "--reverse "; - if(!param.withoutTemplate) - cerr << "--withoutTemplate "; - if(param.subWord) { - cerr << "--subWord "; - if(param.listOfHyperwords.size()){ - cerr << "--listOfHyperwords=" << param.listOfHyperwords << " "; - } - } - if(param.composed) - cerr << "--composed=" << param.inputDicoComp << " "; - cerr << "--charSize=" << param.charSize; - if(param.inputDico.size()) { - cerr << "--input='" << param.inputDico << "' "; - } - if(param.outputDico.size()) { - cerr << "--output='" << param.outputDico << "' "; - } - if(param.listOfWords.size()) { - cerr << "--listOfWords='" << param.listOfWords << "'"; - } - cerr << endl; - - DictTester *wspareTester16=0; - DictTester *wbuilderTester16=0; - DictTester *wbuilderRandomTester16=0; - - if( (!param.spareMem) && (param.addWord) ) { - // Si Builder avec option addWord: BuilderRandom - std::cerr << "Create BuilderRandom dictionary...." << std::endl; - Lima::Common::FsaAccess::FsaAccessBuilderRandom16 *dico=0; - if(param.trieDirectionForward) { - dico = new Lima::Common::FsaAccess::FsaAccessBuilderRandom16(); - } - else { - dico = new Lima::Common::FsaAccess::FsaAccessBuilderRandom16(false); - } - if( param.inputDico.size() > 0) { - std::cerr << "Read dictionary from file... " - << param.inputDico << "..." << std::endl; - dico->read(param.inputDico); - } - wbuilderRandomTester16 = new - DictTester( param, *dico ); - if( param.listOfWords.size() > 0 ) { - std::cerr << "addListOfRandomWords " - << param.listOfWords << "..." << std::endl; - wbuilderRandomTester16->addListOfUnorderedWords(); - } - wbuilderRandomTester16->exec(); - wbuilderRandomTester16->write(); - } - - else if ( !param.spareMem) { - // Si Builder sans option addWord: Builder - std::cerr << "Create dictionary...." 
<< std::endl; - Lima::Common::FsaAccess::FsaAccessBuilder16 *dico=0; - if(param.trieDirectionForward) { - dico = new Lima::Common::FsaAccess::FsaAccessBuilder16(); - } - else { - dico = new Lima::Common::FsaAccess::FsaAccessBuilder16(false); - } - if( param.inputDico.size() > 0) { - std::cerr << "no read operation allowed for FsaAccessBuilder " - << std::endl; - return EXIT_FAILURE; - } - - wbuilderTester16 = new - DictTester( param, *dico ); - if( param.listOfWords.size() > 0 ) { - std::cerr << "addListOfWords " - << param.listOfWords << "..." << std::endl; - wbuilderTester16->addListOfWords(); - } - wbuilderTester16->exec(); - wbuilderTester16->write(); - } - else { - int refSize = 1; - int memSize = 0; - int memSize0 = 0; - if( param.runPerfo ) { - refSize = logFileSize( param.listOfWords ); - logFileSize( param.inputDico ); - memSize0 = getProcStat( std::string("VmSize") ); - std::cout << "procSize before load dico = " << memSize0 << std::endl; - TimeUtils::updateCurrentTime(); - } - Lima::Common::FsaAccess::FsaAccessSpare16 *dico = - new Lima::Common::FsaAccess::FsaAccessSpare16(); - dico->read(param.inputDico); - if( param.runPerfo ) { - TimeUtils::logElapsedTime("load dico"); - memSize = getProcStat( std::string("VmSize") ); - std::cout << "procSize after load dico = " << memSize << std::endl; - std::cout << "dico size in mem = " << memSize - memSize0 << std::endl; - std::cout << "compression rate = " << ((memSize - memSize0)*102400.0)/refSize << "%" << std::endl; - } - if( param.printGraph ) { - std::cerr << "Print graph...." 
<< std::endl; - dico->printGraph(std::cerr); - } - wspareTester16 = new - DictTester( - param, *dico ); - - -/* - Lima::LimaString lcwlem0(Misc::utf8stdstring2limastring("b")); - std::cerr << "lcwlem0=" << lcwlem0 << std::endl; - Lima::LimaString & stlem0 = lcwlem0; - Lima::LimaString & stlem1 = lcwlem1; - Lima::LimaString & stlem2 = lcwlem2; - Lima::LimaString & stlem3 = lcwlem3; - Lima::LimaString & stlem4 = lcwlem4; - Lima::LimaString & stlem5 = lcwlem5; - Lima::LimaString & stlem6 = lcwlem6; - Lima::LimaString & stlem7 = lcwlem7; -*/ - if( param.runIndex ) { - std::cerr << "runIndex" << std::endl; - std::vector listOfWords; - std::vector indexes; - - if( param.listOfWords.size() > 0 ) { - readListOfWords(param.listOfWords, listOfWords ); - } - else { - Lima::LimaString lcwlem1(Misc::utf8stdstring2limastring("béc")); - Lima::LimaString lcwlem2(Misc::utf8stdstring2limastring("séc")); - Lima::LimaString lcwlem3(Misc::utf8stdstring2limastring("sél")); - Lima::LimaString lcwlem4(Misc::utf8stdstring2limastring("sé")); - Lima::LimaString lcwlem5(Misc::utf8stdstring2limastring("s")); - Lima::LimaString lcwlem6(Misc::utf8stdstring2limastring("truc")); - Lima::LimaString lcwlem7(Misc::utf8stdstring2limastring("table")); - listOfWords.push_back( Lima::LimaString(lcwlem1) ); - indexes.push_back(1); - listOfWords.push_back( Lima::LimaString(lcwlem2) ); - indexes.push_back(2); - listOfWords.push_back( Lima::LimaString(lcwlem3) ); - indexes.push_back(3); - listOfWords.push_back( Lima::LimaString(lcwlem4) ); - indexes.push_back(4); - listOfWords.push_back( Lima::LimaString(lcwlem5) ); - indexes.push_back(-1); - listOfWords.push_back( Lima::LimaString(lcwlem6) ); - indexes.push_back(-1); - listOfWords.push_back( Lima::LimaString(lcwlem7) ); - indexes.push_back(-1); - }; - std::cerr << "testIndex" << std::endl; -// for( int i = 10 ; i > 0 ; i-- ) - TimeUtils::updateCurrentTime(); - wspareTester16->testIndex(listOfWords.begin(), listOfWords.end(), indexes ); - uint64_t elapsed = 
TimeUtils::elapsedTime(); - TimeUtils::logElapsedTime("testIndex"); - std::cout << "key average size = " << (refSize*1.0)/dico->getSize() << " byte" << std::endl; - std::cout << "testIndex: average time = " << (elapsed*1000.0)/dico->getSize() << std::endl; - } - - if( param.runSpelling ) { - std::vector listOfWords; - std::vector indexes; - - // case 1: ask for spelling of a word given a termId - if( param.termId > 0 ) { - indexes.push_back(param.termId); - std::cerr << "testSpelling with unique termId " << indexes[0] << std::endl; - } - // case 2: check if getSpelling is ok for every id - // (listOfWords is supposed to contain the complete ordered list of terms - else if( param.listOfWords.size() > 0 ) { - readListOfWords(param.listOfWords, listOfWords ); - int index = 1; - for( std::vector::const_iterator it = listOfWords.begin() ; - it != listOfWords.end() ; it++ ) { - indexes.push_back(index++); - std::cerr << "testSpelling with list of " << indexes.size() << " words" << std::endl; - } - } - wspareTester16->testSpelling(listOfWords.begin(), listOfWords.end(), indexes ); - } - if( param.superWord) { - std::cerr << "runSuper" << std::endl; - std::vector listOfWords; - Lima::LimaString vide; - listOfWords.push_back(vide); - - if( param.listOfWords.size() > 0 ) { - readListOfWords(param.listOfWords, listOfWords ); - } - wspareTester16->testSuper(listOfWords.begin(), listOfWords.end()); - } - - wspareTester16->exec(); - if( param.subWord) { - // cha�e �d�ouper - std::vector hyperwords; - // offset de localisation de l'hypermot dans la cha�e - std::vector offsets; - // r�onses du dictionnaire sur l'appel �getSubword - std::vector > subwords; - if( param.listOfHyperwords.size() > 0 ) { - std::ifstream Hlist(param.listOfHyperwords.c_str(), std::ios::in | std::ios::binary ); - if ( !Hlist.is_open() ) { - std::cerr << "Cannot open list of (hyperword,offset..) 
" << param.listOfHyperwords << std::endl; - return EXIT_FAILURE; - } - std::cerr << "Read hyperword and offset...." << std::endl; - std::string line; - - for( int counter = 0 ; ; counter++ ) { - // lecture d'une ligne du fichier de test - line = Lima::Common::Misc::readLine(Hlist); - if( line.size() == 0 ) { - std::cerr << "end of list of (hyperword,offset)." << std::endl; - break; - } - else { - // extraction chaine a decouper - std::string::size_type hyperword_pos = line.find(';'); - std::string utf8_hyperword(line, 0, hyperword_pos); - Lima::LimaString hyperword = Lima::Common::Misc::utf8stdstring2limastring(utf8_hyperword); - hyperwords.push_back(hyperword); - std::cerr << "push(" << hyperword; -// std::cerr << "offset=" << hyperword_pos << std::endl; - // extraction offset - std::string::size_type offset_pos = line.find(';', hyperword_pos+1); - std::string offset_str(line, hyperword_pos+1, offset_pos-(hyperword_pos+1)); - int offset = std::atoi(offset_str.c_str()); - offsets.push_back(offset); - std::cerr << "," << offset; -// std::cerr << "offset=" << offset_pos << std::endl; - // extraction liste de r�onses attendues - std::vector answers; - std::string::size_type subword_pos0 = offset_pos; - std::string::size_type subword_pos = line.find(';', subword_pos0+1); - for( ; subword_pos != std::string::npos ; subword_pos = line.find(';', subword_pos0+1) ) { - std::string utf8_answer(line, subword_pos0+1, subword_pos-(subword_pos0+1)); - Lima::LimaString answer = Lima::Common::Misc::utf8stdstring2limastring(utf8_answer); - answers.push_back(answer); - std::cerr << "," << answer; -// std::cerr << "offset=" << subword_pos << std::endl; - subword_pos0 = subword_pos; - } - subwords.push_back(answers); - std::cerr << ")" << std::endl; - } - } - } - else { - Lima::LimaString lcwhyper1(Misc::utf8stdstring2limastring("séc")); - Lima::LimaString lcwhyper2(Misc::utf8stdstring2limastring("abcséc")); - Lima::LimaString lcwhyper3(Misc::utf8stdstring2limastring("truc")); - 
Lima::LimaString & stlem1 = lcwhyper1; - Lima::LimaString & stlem2 = lcwhyper2; - Lima::LimaString & stlem3 = lcwhyper3; - - hyperwords.push_back(Lima::LimaString(stlem1)); // s� - offsets.push_back(0); - hyperwords.push_back(Lima::LimaString(stlem2)); // abcs� - offsets.push_back(3); - hyperwords.push_back(Lima::LimaString(stlem3)); // truc - offsets.push_back(0); - } - wspareTester16->testSub(hyperwords, offsets, subwords, param.withAssert); - } -// wspareTester16->write(); - } - - return EXIT_SUCCESS; -} +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. 
If not, see +*/ +/*************************************************************************** + testFsaDict16.cpp - description + ------------------- + begin : lun jun 2 2003 + copyright : (C) 2003 by Olivier Mesnard + email : olivier.mesnard@cea.fr +// ***************************************************************************/ + +/*************************************************************************** + * * + * compact dictionnary based on finite state automata * + * implemented with Boost Graph library * + * * + ***************************************************************************/ +#include "common/LimaCommon.h" + +#include "common/time/traceUtils.h" + +// string and file handling Utilities +#include "common/Data/strwstrtools.h" + +// dictionaries +#include "common/FsaAccess/FsaAccessBuilder16.h" +#include "common/FsaAccess/FsaAccessBuilderRandom16.h" +#include "common/FsaAccess/FsaAccessSpare16.h" +#include "common/misc/AbstractAccessByString.h" + +#include + +// for set locale +#include +// for system() +#include + +#include +#include +#include +#include +#include +#include +#include + +// For ::stat() function +#include +#include +#ifndef WIN32 +#include +#endif +#ifdef ANTINNO_SPECIFIC +// FWI 18/02/2014 : ajout 2 undef +#ifdef WIN32 +#undef max +#undef min +#endif +#endif + +using namespace std; +using namespace Lima; +using namespace Lima::Common; + +int logFileSize( const std::string& filename ) { + struct stat sts; + if( stat( filename.c_str(), &sts) != 0) + std::cerr << "logFileSize: error getting info for file " << filename << std::endl; + std::cout << "taille fichier: " << filename << "= " << sts.st_size << std::endl; + return sts.st_size; +} + +void logMemsize( const string& legend ) { +#ifdef WIN32 + LIMA_UNUSED(legend); +#else + pid_t pid = getpid(); + ostringstream ostr; + ostr << "/proc/" << pid << "/status"; + ifstream statusFile(ostr.str().c_str(), std::ifstream::binary); + char strbuff[200]; + for( ; ; ) { + string status; + 
statusFile.getline(strbuff, 200, '\n' ); + string line(strbuff); + if(line.empty() ) + break; + string::size_type composed1_pos = line.find("VmSize:"); + if( composed1_pos != string::npos ) { + string vmSizeStr(line, composed1_pos+7); + int vmSize = atoi(vmSizeStr.c_str()); + std::cerr << legend << " VmSize:" << vmSize; + } + } +#endif +} + +int getProcStat( const std::string& toLog ) { +#ifdef WIN32 + LIMA_UNUSED(toLog); + return 0; +#else + std::string statusFile; + + ostringstream os; + os << "/proc/" << getpid() << "/status"; + statusFile=os.str(); + + ifstream statusIn(statusFile.c_str(),ios::in | std::ifstream::binary); + string line; + int val; + while (!statusIn.eof()) + { + getline(statusIn,line); +// std::cout << "line = " << line << std::endl; + size_t index=line.find(toLog); + if( index != std::string::npos ) { +// std::cout << "index = " << index << std::endl; + string valstr=line.substr(index+toLog.size()+1); +// std::cout << "valstr = " << valstr << std::endl; + val = atoi(valstr.c_str()); + std::cout << toLog << "=" << val < & listOfWords ) + { + std::ifstream wList(listOfWordsFilename.c_str(), std::ios::in | std::ios::binary ); + if ( !wList.is_open() ) { + std::cerr << "Cannot open list of words " << listOfWordsFilename << std::endl; + return EXIT_FAILURE; + } + std::cerr << "Read list of words" << std::endl; + char strbuff[200]; + + for( int counter = 0 ; ; counter++ ) { + // lecture d'une ligne du fichier + wList.getline(strbuff, 200, '\n' ); + string line(strbuff); + if( line.size() == 0 ) { + std::cerr << "end of list of words. 
counter=" << counter << std::endl; + break; + } + else { + // extraction cha�e + Lima::LimaString word = Lima::Common::Misc::utf8stdstring2limastring(line); + listOfWords.push_back(word); + } + } + return EXIT_SUCCESS; +} + + +template +class DictTester { + public: + DictTester(Param param, dictType &dico) : m_param(param), m_dico(dico) { + } + void exec( void ); + void testSub(std::vector& hyperwords, + std::vector& offsets, + std::vector > &subwords, bool withAssert ); + void testSuper(typename std::vector::const_iterator begin, + typename std::vector::const_iterator end ); + void testIndex( typename std::vector::const_iterator begin, + typename std::vector::const_iterator end, + const std::vector& indexes ); + void testSpelling( typename std::vector::const_iterator begin, + typename std::vector::const_iterator end, + const std::vector& indexes ); + void addListOfWords(); + void addListOfUnorderedWords(); + void write( void ); + private: + Param m_param; + dictType &m_dico; +}; + + +template +void DictTester::addListOfWords() { + + if( !m_param.listOfWords.size() ) + return; + + std::ifstream wList(m_param.listOfWords.c_str(), std::ios::in | std::ios::binary ); + if ( !wList.is_open() ) { + std::cerr << "Cannot open list of words " << m_param.listOfWords << std::endl; + return; + } + std::cerr << "Read list of words" << std::endl; + char strbuff[200]; + + for( int counter = 0 ; ; counter++ ) { + if( (counter%10000) == 0 ) { + ostringstream ostr; + ostr << "\naddListOfWords counter = " << counter; +// std::cerr << "addListOfWords counter = " << counter << std::endl; + logMemsize( ostr.str() ); + } + // lecture d'une ligne du fichier + wList.getline(strbuff, 200, '\n' ); + string line(strbuff); + if( wList.eof() ) + { + std::cerr << "end of list of words. 
counter=" << counter << std::endl; + break; + } + else if (!line.empty()) + { +// std::cerr << "addListOfWords: (" << line << ")" << std::endl; + Lima::LimaString word = Lima::Common::Misc::utf8stdstring2limastring(line); +// std::cerr << "addListOfWords: addWord(" << word << ")" << std::endl; + m_dico.addWord( word ); + } + } + std::cerr << std::endl; + m_dico.pack(); +} + +template +void DictTester::addListOfUnorderedWords() { + + if( m_param.printGraph ) { + std::cerr << "Print graph...." << std::endl; + m_dico.printGraph(std::cerr); + } + + if( !m_param.listOfWords.compare(std::string("")) ) + return; + + std::vector listOfWords; + readListOfWords(m_param.listOfWords, listOfWords); + + int counter(0); + for( std::vector::iterator itWord = listOfWords.begin() ; + itWord != listOfWords.end() ; itWord++, counter++ ) { +// if( (counter%10000) == 0 ) { + std::cerr << "addListOfWords(" << *itWord << "), counter = " << counter << std::endl; +// } + m_dico.addRandomWord( *itWord ); + } + + if( m_param.printGraph ) { + std::cerr << "Print graph...." 
<< std::endl; + m_dico.printGraph(std::cerr); + } +// m_dico.pack(); +} + +template +void DictTester::testIndex( + typename std::vector::const_iterator begin, + typename std::vector::const_iterator end, + const std::vector& indexes ) { + std::cout << "testIndex: getSize() = " << m_dico.getSize() << std::endl; + + std::vector::const_iterator indexItr = indexes.begin(); + int index0 = 0; + + for( typename std::vector::const_iterator lemma = begin ; + lemma != end ; lemma++ ) { + // recup�ation de l'index �partir de la cha�e de caract�es + int index = m_dico.getIndex(*lemma); + // traces + if( index%10000 == 0 ) { + ostringstream ostr; + ostr << "testIndex index = " << index; +// std::cerr << "addListOfWords counter = " << counter << std::endl; + logMemsize( ostr.str() ); + } + if( m_param.withDebug ) { + Lima::LimaString newWord = *lemma; + std::cout << "testIndex: getIndex(" + << Lima::Common::Misc::limastring2utf8stdstring(newWord) + << ")=" << index << std::endl; + } + else { + if( index%10000 == 1 ) { + Lima::LimaString newWord = *lemma; + std::cout << "testIndex: getIndex(" << Lima::Common::Misc::limastring2utf8stdstring(newWord) + << ")=" << index << std::endl; + } + } + // result verification + if( m_param.withAssert ) { + if( indexItr != indexes.end() ) { +// std::cerr << "check " << index << "!=" << *indexItr << std::endl; + assert( index == *indexItr); + indexItr++; + } + else { +// std::cerr << "check " << index << "!=" << index0+1 << std::endl; + assert( index == index0+1 ); + index0 = index; + } + } + } + + // test sur chaine n'existant pas + for( typename std::vector::const_iterator lemma = begin ; + lemma != end ; lemma++ ) { + int index = m_dico.getIndex(*lemma); + Lima::LimaString invertedLemma; + for( int i = (*lemma).size()-1; i >= 0 ; i-- ) { + invertedLemma.push_back((*lemma)[i]); + } + int invertedIndex = m_dico.getIndex(invertedLemma); + // traces + if( index%10000 == 0 ) { + ostringstream ostr; + ostr << "testIndex inverted (" + << 
Lima::Common::Misc::limastring2utf8stdstring(invertedLemma) + << ") index = " << invertedIndex; + logMemsize( ostr.str() ); + } + } +} + +template + void DictTester::testSpelling( typename std::vector::const_iterator begin, + typename std::vector::const_iterator end, + const std::vector& indexes ) +//void DictTester::testSpelling( int *indexVal, int nbIndex ) +{ + LIMA_UNUSED(end); + typename std::vector::const_iterator lemma = begin; + + // if size of indexes = 1, we just display the string return by getSpelling() + std::cout << "testSpelling: getSpelling: indexes.size()=" << indexes.size() << std::endl; + if( indexes.size() == 1 ) { + Lima::LimaString spelling; + spelling = m_dico.getSpelling(indexes[0]); + std::cout << "testSpelling: getSpelling(" << indexes[0] + << ")=" << Lima::Common::Misc::limastring2utf8stdstring(spelling) << std::endl; + } + // for each id, compare result of getSpelling with element in vector of string [begin,end] + for( uint32_t i = 0 ; i < indexes.size() ; i++ ) { + Lima::LimaString spelling; + try{ + spelling = m_dico.getSpelling(indexes[i]); + if( i%10000 == 1 ) { + std::cout << "testSpelling: getSpelling(" << indexes[i] + << ")=" << Lima::Common::Misc::limastring2utf8stdstring(spelling) << std::endl; + } + if( m_param.withAssert ) { + assert( spelling == (*lemma) ); + } + } + catch(std::logic_error e ) { + std::cout << "testSpelling exception: " << e.what() << std::endl; + } + lemma++; + } +} + +template +void DictTester::testSuper( + typename std::vector::const_iterator begin, + typename std::vector::const_iterator end ) { + + for( typename std::vector::const_iterator it = begin ; + it != end ; it++ ) { + try{ + Lima::LimaString prefix = *it; + std::pair entries = + m_dico.getSuperWords(prefix); + std::cout << "testSuper: getSuperWords(" + << Lima::Common::Misc::limastring2utf8stdstring(prefix) + << ")" << std::endl; + for( ; entries.first != entries.second ; entries.first++ ) { + Lima::LimaString superWord = *(entries.first); + 
std::cout << Lima::Common::Misc::limastring2utf8stdstring(superWord) + << ", " << std::endl; + } + std::cout << std::endl; + } + catch(std::logic_error e ) { + std::cout << "testSuper: getSuperWords exception: " << e.what() << std::endl; + } + } +} + +template + void DictTester::testSub( + std::vector & hyperwords, + std::vector & offsets, + std::vector >& subwords, bool withAssert ) { + + typename std::vector::iterator wordIt; + std::vector::iterator offsetIt = offsets.begin(); + typename std::vector >::iterator answersIt = subwords.begin(); + for( wordIt = hyperwords.begin(); wordIt != hyperwords.end() ; wordIt++ ) { + try{ + Lima::LimaString word = *wordIt; + std::pair entries = m_dico.getSubWords(*offsetIt,word); + FSAALOGINIT; + LDEBUG << "test getSubWords(" + << ", " << word << ")" ; + for( AccessSubWordIterator entry = entries.first ; entry != entries.second ; entry++ ) { + LINFO << "string(" << *offsetIt << "," << (*entry).first << "), "; + } + LINFO ; + for( AccessSubWordIterator entry = entries.first ; entry != entries.second ; entry++ ) { + Lima::LimaString subWord = word.mid(*offsetIt, (*entry).first - *offsetIt); + LINFO << subWord << ", "; + } + LINFO ; + if( withAssert ) { + // r�up�ation des r�onses attendues pour v�ifications + assert( answersIt != subwords.end() ); + std::vector answers = *(answersIt++); + typename std::vector::iterator answerIt = answers.begin(); + for( AccessSubWordIterator entry = entries.first ; entry != entries.second ; entry++ ) { + assert( answerIt != answers.end() ); + Lima::LimaString subWord = word.mid(*offsetIt, (*entry).first - *offsetIt); + assert(!subWord.compare(*answerIt)); + answerIt++; + } + } + } + catch(std::logic_error e ) { + std::cout << "testSub: getSubWords exception: " << e.what() << std::endl; + } + offsetIt++; + } +} + +template +void DictTester::exec( void ) { + if( m_param.withDebug ) { + std::cerr << "Print dictionary...." 
<< std::endl; + m_dico.print(std::cout); + } +} + +template +void DictTester::write( void ) { + try { + if( m_param.outputDico.size() > 0 ) { + std::cerr << "Write dictionary...." << std::endl; + m_dico.write(m_param.outputDico); + } + } + catch(LimaException e ) { + std::cout << "write: exception: " << e.what() << std::endl; + } +} + +int main(int argc, char *argv[]) +{ + QCoreApplication a(argc, argv); +#ifdef ANTINNO_SPECIFIC + { + ::std::string const configDir = ::std::getenv("AMOSE_CONF"); + if (configDir.empty()) + { + std::cerr << "No environment variable \"AMOSE_CONF\" set or variable is empty" << std::endl; + return EXIT_FAILURE; + } + ::std::string const log4cppFilePath = configDir + "/log4cpp.properties"; + ::boost::shared_ptr pLog1(new QsLogging::antinno::Log4cpp()); + pLog1->configure(log4cppFilePath); + QsLogging::antinno::log = pLog1; + } +#else + QsLogging::initQsLog(); +#endif + + cerr << argv[0] << " begin..." << endl << " command line: "; + for (int i = 0; i < argc; i++) + { + std::cerr << argv[i] << " "; + } + std::cerr << std::endl; + + setlocale(LC_ALL, ""); +#ifdef DEBUG_CD + FSAALOGINIT; + LDEBUG << argv[0] << " begin..." 
; +#endif + + // options reading + Param param = { + std::string(), // listOfWords + std::string(), // outputDico + std::string(), // inputDico + false, // subWord + std::string(), // listOfHyperwords + false, // superWord + false, // printGraph + false, // spareMem + one_byte, // charSize + false, // withoutTemplate + true, // trieDirectionForward + false, // withDebug + false, // runPerfo + false, // runIndex + false, // addWord + false, // runSpelling + -1, // termId (-1 means no termId specified by user) + false, // composed + false, // withAssert + std::string() // inputDico + }; + + for (int i = 1 ; i < argc; i++) { + QString arg = QString::fromUtf8(argv[i]); + int pos = -1; + if (arg == "--help") + { + std::cerr << "usage: " << argv[0] + << " --help" << std::endl; + std::cerr << " " << argv[0] + << " [--output=]" + << " [--input=]" + << " [--printGraph]" + << " [--subWord]" + << " [--listOfHyperwords=]" + << " [--listOfWords=]" + << " [--superWord]" + << " [--spare]" + << " [--runIndex]" + << " [--addWord]" + << " [--runSpelling]" + << " [--termId=nn" + << " [--composed=]" + << " [--charSize=<1|2|4>]" + << " [--withoutTemplate" + << " [--reverse]" + << " [--withDebug]" + << " [--runPerfo]" + << " [--withAssert]" + << std::endl; + return 0; + } + else if ( (pos = arg.indexOf("--input=")) != -1 ){ + param.inputDico = arg.mid(pos+8).toUtf8().data(); + } + else if ( (pos = arg.indexOf("--output=")) != -1 ){ + param.outputDico = arg.mid(pos+9).toUtf8().data(); + } + else if ( arg =="--printGraph" ){ + param.printGraph = true; + } + else if ( arg == "--subWord" ){ + param.subWord = true; + } + else if ( (pos = arg.indexOf("--listOfHyperwords=")) != -1 ){ + param.listOfHyperwords = arg.mid(pos+19).toUtf8().data(); + } + else if ( (pos = arg.indexOf("--listOfWords=")) != -1 ){ + param.listOfWords = arg.mid(pos+14).toUtf8().data(); + } + else if ( arg == "--superWord" ){ + param.superWord = true; + } + else if ( arg == "--withDebug" ){ + param.withDebug = true; + } + 
else if ( arg == "--runPerfo" ){ + param.runPerfo = true; + } + else if ( arg == "--withoutTemplate" ){ + param.withoutTemplate = true; + } + else if ( (pos = arg.indexOf("--charSize=")) != -1 ){ + int charSize = (arg.mid(pos+11)).toInt(); + switch(charSize) { + case 1: + param.charSize = one_byte; + break; + case 2: + param.charSize = two_bytes; + break; + case 4: + param.charSize = four_bytes; + break; + } + } + else if ( arg == "--spare" ){ + param.spareMem = true; + } + else if ( arg == "--runIndex" ){ + param.runIndex = true; + } + else if ( arg == "--addWord" ){ + param.addWord = true; + } + else if ( arg == "--runSpelling" ){ + param.runSpelling = true; + } + else if ( (pos = arg.indexOf("--termId=")) != -1 ){ + param.termId = (arg.mid(pos+9)).toInt(); + } + else if ( arg == "--reverse" ){ + param.trieDirectionForward = false; + } + else if ( (pos = arg.indexOf("--composed=")) != -1 ){ + param.composed = true; + param.inputDicoComp = arg.mid(pos+12).toUtf8().data(); + } + else if ( arg == "--withAssert" ){ + param.withAssert = true; + } + } + + cerr << argv[0] << ": "; + if(param.withDebug) + cerr << "--withDebug "; + if(param.runPerfo) + cerr << "--runPerfo "; + if(param.spareMem) + cerr << "--spare "; + if(param.runIndex) + cerr << "--runIndex "; + if(param.addWord) + cerr << "--addWord "; + if(param.runSpelling) + cerr << "--runSpelling "; + if(param.printGraph) + cerr << "--printGraph "; + if(!param.trieDirectionForward) + cerr << "--reverse "; + if(!param.withoutTemplate) + cerr << "--withoutTemplate "; + if(param.subWord) { + cerr << "--subWord "; + if(param.listOfHyperwords.size()){ + cerr << "--listOfHyperwords=" << param.listOfHyperwords << " "; + } + } + if(param.composed) + cerr << "--composed=" << param.inputDicoComp << " "; + cerr << "--charSize=" << param.charSize; + if(param.inputDico.size()) { + cerr << "--input='" << param.inputDico << "' "; + } + if(param.outputDico.size()) { + cerr << "--output='" << param.outputDico << "' "; + } + 
if(param.listOfWords.size()) { + cerr << "--listOfWords='" << param.listOfWords << "'"; + } + cerr << endl; + + DictTester *wspareTester16=0; + DictTester *wbuilderTester16=0; + DictTester *wbuilderRandomTester16=0; + + if( (!param.spareMem) && (param.addWord) ) { + // Si Builder avec option addWord: BuilderRandom + std::cerr << "Create BuilderRandom dictionary...." << std::endl; + Lima::Common::FsaAccess::FsaAccessBuilderRandom16 *dico=0; + if(param.trieDirectionForward) { + dico = new Lima::Common::FsaAccess::FsaAccessBuilderRandom16(); + } + else { + dico = new Lima::Common::FsaAccess::FsaAccessBuilderRandom16(false); + } + if( param.inputDico.size() > 0) { + std::cerr << "Read dictionary from file... " + << param.inputDico << "..." << std::endl; + dico->read(param.inputDico); + } + wbuilderRandomTester16 = new + DictTester( param, *dico ); + if( param.listOfWords.size() > 0 ) { + std::cerr << "addListOfRandomWords " + << param.listOfWords << "..." << std::endl; + wbuilderRandomTester16->addListOfUnorderedWords(); + } + wbuilderRandomTester16->exec(); + wbuilderRandomTester16->write(); + } + + else if ( !param.spareMem) { + // Si Builder sans option addWord: Builder + std::cerr << "Create dictionary...." << std::endl; + Lima::Common::FsaAccess::FsaAccessBuilder16 *dico=0; + if(param.trieDirectionForward) { + dico = new Lima::Common::FsaAccess::FsaAccessBuilder16(); + } + else { + dico = new Lima::Common::FsaAccess::FsaAccessBuilder16(false); + } + if( param.inputDico.size() > 0) { + std::cerr << "no read operation allowed for FsaAccessBuilder " + << std::endl; + return EXIT_FAILURE; + } + + wbuilderTester16 = new + DictTester( param, *dico ); + if( param.listOfWords.size() > 0 ) { + std::cerr << "addListOfWords " + << param.listOfWords << "..." 
<< std::endl; + wbuilderTester16->addListOfWords(); + } + wbuilderTester16->exec(); + wbuilderTester16->write(); + } + else { + int refSize = 1; + int memSize = 0; + int memSize0 = 0; + if( param.runPerfo ) { + refSize = logFileSize( param.listOfWords ); + logFileSize( param.inputDico ); + memSize0 = getProcStat( std::string("VmSize") ); + std::cout << "procSize before load dico = " << memSize0 << std::endl; + TimeUtils::updateCurrentTime(); + } + Lima::Common::FsaAccess::FsaAccessSpare16 *dico = + new Lima::Common::FsaAccess::FsaAccessSpare16(); + dico->read(param.inputDico); + if( param.runPerfo ) { + TimeUtils::logElapsedTime("load dico"); + memSize = getProcStat( std::string("VmSize") ); + std::cout << "procSize after load dico = " << memSize << std::endl; + std::cout << "dico size in mem = " << memSize - memSize0 << std::endl; + std::cout << "compression rate = " << ((memSize - memSize0)*102400.0)/refSize << "%" << std::endl; + } + if( param.printGraph ) { + std::cerr << "Print graph...." 
<< std::endl; + dico->printGraph(std::cerr); + } + wspareTester16 = new + DictTester( + param, *dico ); + + +/* + Lima::LimaString lcwlem0(Misc::utf8stdstring2limastring("b")); + std::cerr << "lcwlem0=" << lcwlem0 << std::endl; + Lima::LimaString & stlem0 = lcwlem0; + Lima::LimaString & stlem1 = lcwlem1; + Lima::LimaString & stlem2 = lcwlem2; + Lima::LimaString & stlem3 = lcwlem3; + Lima::LimaString & stlem4 = lcwlem4; + Lima::LimaString & stlem5 = lcwlem5; + Lima::LimaString & stlem6 = lcwlem6; + Lima::LimaString & stlem7 = lcwlem7; +*/ + if( param.runIndex ) { + std::cerr << "runIndex" << std::endl; + std::vector listOfWords; + std::vector indexes; + + if( param.listOfWords.size() > 0 ) { + readListOfWords(param.listOfWords, listOfWords ); + } + else { + Lima::LimaString lcwlem1(Misc::utf8stdstring2limastring("béc")); + Lima::LimaString lcwlem2(Misc::utf8stdstring2limastring("séc")); + Lima::LimaString lcwlem3(Misc::utf8stdstring2limastring("sél")); + Lima::LimaString lcwlem4(Misc::utf8stdstring2limastring("sé")); + Lima::LimaString lcwlem5(Misc::utf8stdstring2limastring("s")); + Lima::LimaString lcwlem6(Misc::utf8stdstring2limastring("truc")); + Lima::LimaString lcwlem7(Misc::utf8stdstring2limastring("table")); + listOfWords.push_back( Lima::LimaString(lcwlem1) ); + indexes.push_back(1); + listOfWords.push_back( Lima::LimaString(lcwlem2) ); + indexes.push_back(2); + listOfWords.push_back( Lima::LimaString(lcwlem3) ); + indexes.push_back(3); + listOfWords.push_back( Lima::LimaString(lcwlem4) ); + indexes.push_back(4); + listOfWords.push_back( Lima::LimaString(lcwlem5) ); + indexes.push_back(-1); + listOfWords.push_back( Lima::LimaString(lcwlem6) ); + indexes.push_back(-1); + listOfWords.push_back( Lima::LimaString(lcwlem7) ); + indexes.push_back(-1); + }; + std::cerr << "testIndex" << std::endl; +// for( int i = 10 ; i > 0 ; i-- ) + TimeUtils::updateCurrentTime(); + wspareTester16->testIndex(listOfWords.begin(), listOfWords.end(), indexes ); + uint64_t elapsed = 
TimeUtils::elapsedTime(); + TimeUtils::logElapsedTime("testIndex"); + std::cout << "key average size = " << (refSize*1.0)/dico->getSize() << " byte" << std::endl; + std::cout << "testIndex: average time = " << (elapsed*1000.0)/dico->getSize() << std::endl; + } + + if( param.runSpelling ) { + std::vector listOfWords; + std::vector indexes; + + // case 1: ask for spelling of a word given a termId + if( param.termId > 0 ) { + indexes.push_back(param.termId); + std::cerr << "testSpelling with unique termId " << indexes[0] << std::endl; + } + // case 2: check if getSpelling is ok for every id + // (listOfWords is supposed to contain the complete ordered list of terms + else if( param.listOfWords.size() > 0 ) { + readListOfWords(param.listOfWords, listOfWords ); + int index = 1; + for( std::vector::const_iterator it = listOfWords.begin() ; + it != listOfWords.end() ; it++ ) { + indexes.push_back(index++); + std::cerr << "testSpelling with list of " << indexes.size() << " words" << std::endl; + } + } + wspareTester16->testSpelling(listOfWords.begin(), listOfWords.end(), indexes ); + } + if( param.superWord) { + std::cerr << "runSuper" << std::endl; + std::vector listOfWords; + Lima::LimaString vide; + listOfWords.push_back(vide); + + if( param.listOfWords.size() > 0 ) { + readListOfWords(param.listOfWords, listOfWords ); + } + wspareTester16->testSuper(listOfWords.begin(), listOfWords.end()); + } + + wspareTester16->exec(); + if( param.subWord) { + // cha�e �d�ouper + std::vector hyperwords; + // offset de localisation de l'hypermot dans la cha�e + std::vector offsets; + // r�onses du dictionnaire sur l'appel �getSubword + std::vector > subwords; + if( param.listOfHyperwords.size() > 0 ) { + std::ifstream Hlist(param.listOfHyperwords.c_str(), std::ios::in | std::ios::binary ); + if ( !Hlist.is_open() ) { + std::cerr << "Cannot open list of (hyperword,offset..) 
" << param.listOfHyperwords << std::endl; + return EXIT_FAILURE; + } + std::cerr << "Read hyperword and offset...." << std::endl; + std::string line; + + for( int counter = 0 ; ; counter++ ) { + // lecture d'une ligne du fichier de test + line = Lima::Common::Misc::readLine(Hlist); + if( line.size() == 0 ) { + std::cerr << "end of list of (hyperword,offset)." << std::endl; + break; + } + else { + // extraction chaine a decouper + std::string::size_type hyperword_pos = line.find(';'); + std::string utf8_hyperword(line, 0, hyperword_pos); + Lima::LimaString hyperword = Lima::Common::Misc::utf8stdstring2limastring(utf8_hyperword); + hyperwords.push_back(hyperword); + std::cerr << "push(" << hyperword; +// std::cerr << "offset=" << hyperword_pos << std::endl; + // extraction offset + std::string::size_type offset_pos = line.find(';', hyperword_pos+1); + std::string offset_str(line, hyperword_pos+1, offset_pos-(hyperword_pos+1)); + int offset = std::atoi(offset_str.c_str()); + offsets.push_back(offset); + std::cerr << "," << offset; +// std::cerr << "offset=" << offset_pos << std::endl; + // extraction liste de r�onses attendues + std::vector answers; + std::string::size_type subword_pos0 = offset_pos; + std::string::size_type subword_pos = line.find(';', subword_pos0+1); + for( ; subword_pos != std::string::npos ; subword_pos = line.find(';', subword_pos0+1) ) { + std::string utf8_answer(line, subword_pos0+1, subword_pos-(subword_pos0+1)); + Lima::LimaString answer = Lima::Common::Misc::utf8stdstring2limastring(utf8_answer); + answers.push_back(answer); + std::cerr << "," << answer; +// std::cerr << "offset=" << subword_pos << std::endl; + subword_pos0 = subword_pos; + } + subwords.push_back(answers); + std::cerr << ")" << std::endl; + } + } + } + else { + Lima::LimaString lcwhyper1(Misc::utf8stdstring2limastring("séc")); + Lima::LimaString lcwhyper2(Misc::utf8stdstring2limastring("abcséc")); + Lima::LimaString lcwhyper3(Misc::utf8stdstring2limastring("truc")); + 
Lima::LimaString & stlem1 = lcwhyper1; + Lima::LimaString & stlem2 = lcwhyper2; + Lima::LimaString & stlem3 = lcwhyper3; + + hyperwords.push_back(Lima::LimaString(stlem1)); // s� + offsets.push_back(0); + hyperwords.push_back(Lima::LimaString(stlem2)); // abcs� + offsets.push_back(3); + hyperwords.push_back(Lima::LimaString(stlem3)); // truc + offsets.push_back(0); + } + wspareTester16->testSub(hyperwords, offsets, subwords, param.withAssert); + } +// wspareTester16->write(); + } + + return EXIT_SUCCESS; +} + + diff --git a/lima_linguisticdata/SRLIntegration/CMakeLists.txt b/lima_linguisticdata/SRLIntegration/CMakeLists.txt index c4e94b3fd..8021cec02 100644 --- a/lima_linguisticdata/SRLIntegration/CMakeLists.txt +++ b/lima_linguisticdata/SRLIntegration/CMakeLists.txt @@ -32,6 +32,17 @@ foreach(LANG ${LIMA_LANGUAGES}) COMMENT "create config env for srl rules (VerbNet-modex.xml)" VERBATIM ) + add_custom_command( + OUTPUT ${CMAKE_BINARY_DIR}/execEnv/config/FrameNet-modex.xml + COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_BINARY_DIR}/execEnv/config + COMMAND ${CMAKE_COMMAND} -E copy + ${CMAKE_SOURCE_DIR}/SRLIntegration/FrameNet-modex.xml + ${CMAKE_BINARY_DIR}/execEnv/config/FrameNet-modex.xml + DEPENDS + ${CMAKE_SOURCE_DIR}/SRLIntegration/FrameNet-modex.xml + COMMENT "create config env for srl rules (FrameNet-modex.xml)" + VERBATIM + ) add_custom_command( OUTPUT ${CMAKE_BINARY_DIR}/execEnv/config/lima-common-${LANG}.xml COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_BINARY_DIR}/execEnv/config @@ -88,6 +99,15 @@ foreach(LANG ${LIMA_LANGUAGES}) DEPENDS ${CMAKE_BINARY_DIR}/execEnv/config/lima-analysis.xml DEPENDS ${CMAKE_BINARY_DIR}/execEnv/config/lima-common.xml ) + add_custom_target( + rules-${LANG}-FrameNet-configEnv + ALL + DEPENDS ${CMAKE_BINARY_DIR}/execEnv/config/FrameNet-modex.xml + DEPENDS ${CMAKE_BINARY_DIR}/execEnv/config/lima-common-${LANG}.xml + DEPENDS ${CMAKE_BINARY_DIR}/execEnv/config/lima-lp-${LANG}.xml + DEPENDS 
${CMAKE_BINARY_DIR}/execEnv/config/lima-analysis.xml + DEPENDS ${CMAKE_BINARY_DIR}/execEnv/config/lima-common.xml + ) add_custom_command( OUTPUT ${CMAKE_BINARY_DIR}/execEnv/resources/LinguisticProcessings/${LANG}/code-${LANG}.xml @@ -137,7 +157,34 @@ foreach(LANG ${LIMA_LANGUAGES}) endif () + + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/FrameNet-${LANG}.rules) + + add_custom_command( + OUTPUT FrameNet-${LANG}.bin + COMMAND compile-rules --resourcesDir=${CMAKE_BINARY_DIR}/execEnv/resources --configDir=${CMAKE_BINARY_DIR}/execEnv/config --language=${LANG} -oFrameNet-${LANG}.bin ${_current} --modex=FrameNet-modex.xml ${CMAKE_CURRENT_SOURCE_DIR}/FrameNet-${LANG}.rules + DEPENDS ${_current} ${DEPENDENCIES} +# WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + VERBATIM + ) + + add_custom_target( + rules-FrameNet-${LANG}-main + ALL + DEPENDS FrameNet-${LANG}.bin + ) + + # add the link between the current target and its execution environment dependencies + add_dependencies(rules-FrameNet-${LANG}-main rules-${LANG}-FrameNet-configEnv-main rules-${LANG}-execEnv) + + add_dependencies(rules-FrameNet-${LANG}-main rules-${LANG}-FrameNet-configEnv srl-rules-${LANG}-execEnv) + + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/FrameNet-${LANG}.bin COMPONENT ${LANG} DESTINATION share/apps/lima/resources/SRLIntegration) + + + endif () + endforeach(LANG ${LIMA_LANGUAGES}) -install(FILES VerbNet-modex.xml COMPONENT common DESTINATION share/config/lima) +install(FILES FrameNet-modex.xml VerbNet-modex.xml COMPONENT common DESTINATION share/config/lima) diff --git a/lima_linguisticdata/SRLIntegration/FrameNet-eng.rules b/lima_linguisticdata/SRLIntegration/FrameNet-eng.rules new file mode 100644 index 000000000..4a7423057 --- /dev/null +++ b/lima_linguisticdata/SRLIntegration/FrameNet-eng.rules @@ -0,0 +1,5 @@ +set encoding=utf8 +using modex FrameNet-modex.xml +using groups FrameNet +set defaultAction=>CreateSpecificEntity() + diff --git a/lima_linguisticdata/SRLIntegration/FrameNet-fre.rules 
b/lima_linguisticdata/SRLIntegration/FrameNet-fre.rules new file mode 100644 index 000000000..bc6c62c66 --- /dev/null +++ b/lima_linguisticdata/SRLIntegration/FrameNet-fre.rules @@ -0,0 +1,6 @@ +set encoding=utf8 +using modex FrameNet-modex.xml +using groups FrameNet +set defaultAction=>CreateSpecificEntity() + + diff --git a/lima_linguisticdata/SRLIntegration/FrameNet-modex.xml b/lima_linguisticdata/SRLIntegration/FrameNet-modex.xml new file mode 100644 index 000000000..418688b1e --- /dev/null +++ b/lima_linguisticdata/SRLIntegration/FrameNet-modex.xml @@ -0,0 +1,2231 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/lima_linguisticdata/SRLIntegration/lima_conll_dependency_tag_mapping/CMakeLists.txt b/lima_linguisticdata/SRLIntegration/lima_conll_dependency_tag_mapping/CMakeLists.txt index e3ef41cb5..58f8e8643 100644 --- a/lima_linguisticdata/SRLIntegration/lima_conll_dependency_tag_mapping/CMakeLists.txt +++ b/lima_linguisticdata/SRLIntegration/lima_conll_dependency_tag_mapping/CMakeLists.txt @@ -1 +1,7 @@ -install(FILES mapping_conll_Lima.txt COMPONENT common DESTINATION share/apps/lima/resources/SRLIntegration/lima_conll_dependency_tag_mapping) +install( +FILES + mapping_conll_Lima.txt + mapping_conll_lima_fre.txt +COMPONENT + common +DESTINATION share/apps/lima/resources/SRLIntegration/lima_conll_dependency_tag_mapping) diff --git a/lima_linguisticdata/SRLIntegration/lima_conll_dependency_tag_mapping/mapping_conll_lima_fre.txt b/lima_linguisticdata/SRLIntegration/lima_conll_dependency_tag_mapping/mapping_conll_lima_fre.txt new file mode 
100644 index 000000000..0ffd6ec5a --- /dev/null +++ b/lima_linguisticdata/SRLIntegration/lima_conll_dependency_tag_mapping/mapping_conll_lima_fre.txt @@ -0,0 +1,40 @@ +acl acl +ADVADJ advmod +ADVADV advmod +AdvVerbe advmod +ADJPRENSUB amod +ATB_S cop +aux aux +auxpass auxpass +COD_V dobj +CodPrev dobj +COORD1 cc +COORD2 cc +COMPDUNOM nmod +COMPL ccomp +CPLV_V advmod +CPL_V iobj +DetAdj det +DetIntSub det +det det +DETSUB det +Dummy dep +Neg neg +MOD_V iobj +PrepDetInt case +PrepInf mark +PronSujVerbe nsubj +SUBADJPOST amod +SujInv nsubj +SUJ_V nsubj +PREPSUB case +MOD_A amod +MOD_N amod +SUBSUBJUX compound +APPOS appos +COMPADJ amod +COMPADV advmod +Pleon nsubj +PrepPartPres case +PrepPronRel case +PronReflVerbe expl diff --git a/lima_linguisticdata/SpecificEntities/conf/Numex-modex.xml b/lima_linguisticdata/SpecificEntities/conf/Numex-modex.xml index 810c6a05b..df31a51b0 100644 --- a/lima_linguisticdata/SpecificEntities/conf/Numex-modex.xml +++ b/lima_linguisticdata/SpecificEntities/conf/Numex-modex.xml @@ -105,7 +105,7 @@ - + @@ -124,7 +124,7 @@ - + diff --git a/lima_linguisticdata/SpecificEntities/fre/Numex/NUMBER-fre.rules b/lima_linguisticdata/SpecificEntities/fre/Numex/NUMBER-fre.rules index bb50c0d43..434b7bdca 100644 --- a/lima_linguisticdata/SpecificEntities/fre/Numex/NUMBER-fre.rules +++ b/lima_linguisticdata/SpecificEntities/fre/Numex/NUMBER-fre.rules @@ -198,6 +198,36 @@ LE:::NOT_NUMBER: # In sport scores like "1 - 1", each integer is a number and the - is not a minus sign @NumForm:[@NumForm] [(+|-)]::NUMBER:=>NormalizeNumber() + +@Decimal=(t_comma_number,t_dot_number) +@SmallDecimalGroup=(t_integer<100) +@IntegerGroup=(t_integer>99<1000) +@LargeNumber=(t_integer>1000) + +### Numbers in digits + +# 1 234.5 +# 12 345.6 +# 12 345 678.9 +# but also errors like: 12 345 6.7 +@SmallDecimalGroup:(+|-)?:@IntegerGroup{0-3} (@Decimal|@IntegerGroup) @ChiffreAines? 
\%?:NUMBER:=>NormalizeNumber() + +# 123 456.7 +@IntegerGroup:(+|-)?:@IntegerGroup{0-3} (@Decimal|@IntegerGroup) @ChiffreAines? \%?:NUMBER:=>NormalizeNumber() + +# 12 +@SmallDecimalGroup:(+|-)?:@ChiffreAines? \%?:NUMBER:=>NormalizeNumber() + +# 12345 +# 1234 millions +@LargeNumber:(+|-)?:@ChiffreAines? \%?:NUMBER:=>NormalizeNumber() + + +# 12.3 +# 123.4 +# 12345.6 +@Decimal:(+|-)?:@ChiffreAines? \%?:NUMBER:=>NormalizeNumber() + @NumForm:(+|-)?:(@NumForm|@Number)? \%?:NUMBER:=>NormalizeNumber() -@Number:(+|-)?:@Number{0-3} \%?:NUMBER:=>NormalizeNumber() +@Number:(+|-)?:@Number{0-5} @ChiffreAines? \%?:NUMBER:=>NormalizeNumber() @OrdNumber:(+|-)?:\%?:NUMBER:=>NormalizeNumber() diff --git a/lima_linguisticdata/analysisDictionary/eng/freeling/numbers.dic b/lima_linguisticdata/analysisDictionary/eng/freeling/numbers.dic index 9a0317ebb..5e8716016 100644 --- a/lima_linguisticdata/analysisDictionary/eng/freeling/numbers.dic +++ b/lima_linguisticdata/analysisDictionary/eng/freeling/numbers.dic @@ -98,105 +98,3 @@ twenty-six twenty-six CD twenty-three twenty-three CD twenty-two twenty-two CD two two CD - -8 eight CD -18 eighteen CD -80 eighty CD -88 eighty-eight CD -85 eighty-five CD -84 eighty-four CD -89 eighty-nine CD -81 eighty-one CD -87 eighty-seven CD -86 eighty-six CD -83 eighty-three CD -82 eighty-two CD -7 eleven CD -15 fifteen CD -50 fifty CD -58 fifty-eight CD -55 fifty-five CD -54 fifty-four CD -59 fifty-nine CD -51 fifty-one CD -57 fifty-seven CD -56 fifty-six CD -53 fifty-three CD -52 fifty-two CD -5 five CD -40 forty CD -48 forty-eight CD -45 forty-five CD -44 forty-four CD -49 forty-nine CD -41 forty-one CD -47 forty-seven CD -46 forty-six CD -43 forty-three CD -42 forty-two CD -4 four CD -14 fourteen CD -9 nine CD -19 nineteen CD -90 ninety CD -98 ninety-eight CD -95 ninety-five CD -94 ninety-four CD -99 ninety-nine CD -91 ninety-one CD -97 ninety-seven CD -96 ninety-six CD -93 ninety-three CD -92 ninety-two CD -1 one CD -1,000 one-hundred CD -1000 
one-hundred CD -7 seven CD -17 seventeen CD -70 seventy CD -78 seventy-eight CD -75 seventy-five CD -74 seventy-four CD -79 seventy-nine CD -71 seventy-one CD -77 seventy-seven CD -76 seventy-six CD -73 seventy-three CD -72 seventy-two CD -6 six CD -16 sixteen CD -60 sixty CD -68 sixty-eight CD -65 sixty-five CD -64 sixty-four CD -69 sixty-nine CD -61 sixty-one CD -67 sixty-seven CD -66 sixty-six CD -63 sixty-three CD -62 sixty-two CD -10 ten CD -13 thirteen CD -30 thirty CD -38 thirty-eight CD -35 thirty-five CD -34 thirty-four CD -39 thirty-nine CD -31 thirty-one CD -37 thirty-seven CD -36 thirty-six CD -33 thirty-three CD -32 thirty-two CD -3 three CD -12 twelve CD -20 twenty CD -28 twenty-eight CD -25 twenty-five CD -24 twenty-four CD -29 twenty-nine CD -21 twenty-one CD -27 twenty-seven CD -26 twenty-six CD -23 twenty-three CD -22 twenty-two CD -2 two CD diff --git a/lima_linguisticdata/analysisDictionary/fre/convert/default-fre.txt b/lima_linguisticdata/analysisDictionary/fre/convert/default-fre.txt index bcb0316c6..fc5740bf6 100644 --- a/lima_linguisticdata/analysisDictionary/fre/convert/default-fre.txt +++ b/lima_linguisticdata/analysisDictionary/fre/convert/default-fre.txt @@ -253,3 +253,8 @@ t_ordinal_integer Aoz--mp t_ordinal_integer Aoz--fp t_pattern Ea t_fallback Ea +t_url Npgms- +t_url Npgfs- +t_url Npgmp- +t_url Npgfp- +t_url Ee \ No newline at end of file diff --git a/lima_linguisticdata/analysisDictionary/fre/lefff/lefff-ext-1.txt b/lima_linguisticdata/analysisDictionary/fre/lefff/lefff-ext-1.txt index b67191979..5ecac0c46 100644 --- a/lima_linguisticdata/analysisDictionary/fre/lefff/lefff-ext-1.txt +++ b/lima_linguisticdata/analysisDictionary/fre/lefff/lefff-ext-1.txt @@ -225903,6 +225903,8 @@ chiliens 100 nc [pred="chilien_____1",cat chilio-_ 100 adjPref [pred="chilio-______1",cat=adv] chilio-______1 Default %default pref chimie 100 nc [pred="chimie_____1",cat=nc,@fs] chimie_____1 Default fs %default nc-2f chimies 100 nc 
[pred="chimie_____1",cat=nc,@fp] chimie_____1 Default fp %default nc-2f +chimio 100 nc [pred="chimiothérapie_____1",cat=nc,@fs] chimiothérapie_____1 Default fs %default nc-2f +chimios 100 nc [pred="chimiothérapie_____1",cat=nc,@fp] chimiothérapie_____1 Default fp %default nc-2f chimiothérapie 100 nc [pred="chimiothérapie_____1",cat=nc,@fs] chimiothérapie_____1 Default fs %default nc-2f chimiothérapies 100 nc [pred="chimiothérapie_____1",cat=nc,@fp] chimiothérapie_____1 Default fp %default nc-2f chimique 100 adj [pred="chimique_____1",@pers,cat=adj,@s] chimique_____1 Default s %adj_personnel adj-ique2 diff --git a/lima_linguisticdata/analysisDictionary/fre/lefff/lefff-ext-2.txt b/lima_linguisticdata/analysisDictionary/fre/lefff/lefff-ext-2.txt index ba2ea6eef..7b6751651 100644 --- a/lima_linguisticdata/analysisDictionary/fre/lefff/lefff-ext-2.txt +++ b/lima_linguisticdata/analysisDictionary/fre/lefff/lefff-ext-2.txt @@ -35086,6 +35086,7 @@ flashé 100 v [pred="flasher_____1",@active flashée 100 v [pred="flasher_____1",@active,@pers,cat=v,@Kfs] flasher_____1 PastParticiple Kfs %actif v-er:std flashées 100 v [pred="flasher_____1",@active,@pers,cat=v,@Kfp] flasher_____1 PastParticiple Kfp %actif v-er:std flashés 100 v [pred="flasher_____1",@active,@pers,cat=v,@Kmp] flasher_____1 PastParticiple Kmp %actif v-er:std +flashy 100 adj [pred="flashy_____1",@pers,cat=adj] flashy_____1 Default %adj_personnel adj-1 flasque 100 adj [pred="flasque_____1",@pers,cat=adj,@s] flasque_____1 Default s %adj_personnel adj-2 flasque 100 nc [pred="flasque_____1",cat=nc,@s] flasque_____1 Default s %default nc-2 flasquement 100 advm [pred="flasquement_____1",clivee=+,cat=adv] flasquement_____1 Default %default adv diff --git a/lima_linguisticdata/analysisDictionary/fre/lefff/lefff-ext-3.txt b/lima_linguisticdata/analysisDictionary/fre/lefff/lefff-ext-3.txt index 33ce70bce..18e336af9 100644 --- a/lima_linguisticdata/analysisDictionary/fre/lefff/lefff-ext-3.txt +++ 
b/lima_linguisticdata/analysisDictionary/fre/lefff/lefff-ext-3.txt @@ -24681,6 +24681,8 @@ smaltines 100 nc [pred="smaltine_____1",c smalts 100 nc [pred="smalt_____1",cat=nc,@mp] smalt_____1 Default mp %default nc-2m smaragdite 100 nc [pred="smaragdite_____1",cat=nc,@fs] smaragdite_____1 Default fs %default nc-2f smaragdites 100 nc [pred="smaragdite_____1",cat=nc,@fp] smaragdite_____1 Default fp %default nc-2f +smartphone 100 nc [pred="smartphone_____1",cat=nc,@ms] smartphone_____1 Default ms %default nc-2m +smartphones 100 nc [pred="smartphone_____1",cat=nc,@mp] smartphone_____1 Default mp %default nc-2m smash 100 nc [pred="smash_____1",cat=nc,semtype=event|-,@ms] smash_____1 Default ms %default 0 smasha 100 v [pred="smasher_____1",@pers,cat=v,@J3s] smasher_____1 ThirdSing J3s %actif v-er:std smashai 100 v [pred="smasher_____1",@pers,cat=v,@J1s] smasher_____1 Default J1s %actif v-er:std @@ -42805,6 +42807,8 @@ stimula 100 v [pred="stimuler_____1",@pers, stimula 100 v [pred="stimuler_____1",@pers,cat=v,@J3s] stimuler_____1 ThirdSing J3s %actif v-er:std stimula 100 v [pred="stimuler_____1se",@pers,@se_moyen,@être,cat=v,@J3s] stimuler_____1 ThirdSing J3s %se_moyen v-er:std stimula 100 v [pred="stimuler_____2",@pers,cat=v,@J3s] stimuler_____2 ThirdSing J3s %actif v-er:std +stimulable 100 adj [pred="stimulable_____1",@pers,cat=adj,@s] stimulable_____1 Default s %adj_personnel adj-2 +stimulables 100 adj [pred="stimulable_____1",@pers,cat=adj,@p] stimulable_____1 Default p %adj_personnel adj-2 stimulai 100 v [pred="stimuler_____1",@pers,cat=v,@J1s] stimuler_____1 Default J1s %actif v-er:std stimulai 100 v [pred="stimuler_____1",@pers,cat=v,@J1s] stimuler_____1 Default J1s %actif v-er:std stimulai 100 v [pred="stimuler_____1se",@pers,@se_moyen,@être,cat=v,@J1s] stimuler_____1 Default J1s %se_moyen v-er:std diff --git a/lima_linguisticdata/analysisDictionary/fre/lefff/lefff-ext-lima.dic b/lima_linguisticdata/analysisDictionary/fre/lefff/lefff-ext-lima.dic index 
c7899c8d4..62b87e12e 100644 --- a/lima_linguisticdata/analysisDictionary/fre/lefff/lefff-ext-lima.dic +++ b/lima_linguisticdata/analysisDictionary/fre/lefff/lefff-ext-lima.dic @@ -38,46 +38,46 @@ $ mâle Afha--- -> -> Ff -> -> Ff -ce ce Ppcsa---- --elle cln Ppcsa3-fs --elles cln Ppcsa3-fp --en clg Ppcda1-ms --en cll Ppcda1-ms --il cln Ppcsa3-ms +-elle elle Ppcsa3-fs +-elles elle Ppcsa3-fp +-en en Ppcda1-ms +-en en Ppcda1-ms +-il il Ppcsa3-ms -il ilimp Ppcsa---- --ils cln Ppcsa3-mp --je cln Ppcsa1-ms --la cla Ppcda3-fs --le cla Ppcda3-ms --les cla Ppcda3-mp --leur cld Ppcda3-mp --lui cld Ppcda3-ms --m' cld Ppcda1-ms --moi cla Ppcda1-ms --moi cld Ppcda1-ms --nous cla Ppcda1-mp --nous cld Ppcda1-mp --nous cln Ppcsa1-mp --on cln Ppcsa3-ms --t' cld Ppcda2-ms --t-elle cln Ppcsa3-fs --t-elles cln Ppcsa3-fp --t-en cll Ppcda1-ms --t-il cln Ppcsa3-ms --t-il ilimp Ppcsa---- --t-ils cln Ppcsa3-mp --t-on cln Ppcsa3-ms --t-y cld Ppcda1-ms --toi cla Ppcda2-ms --toi cld Ppcda2-ms --tu cln Ppcsa2-ms --vous cla Ppcda2-mp --vous cld Ppcda2-mp --vous cln Ppcsa2-mp --vs cla Ppcda2-mp --vs cld Ppcda2-mp --vs cln Ppcsa2-mp --y cld Ppcda1-ms --y cll Ppcda1-ms +-ils ils Ppcsa3-mp +-je je Ppcsa1-ms +-la la Ppcda3-fs +-le le Ppcda3-ms +-les les Ppcda3-mp +-leur leur Ppcda3-mp +-lui lui Ppcda3-ms +-m' me Ppcda1-ms +-moi moi Ppcda1-ms +-moi moi Ppcda1-ms +-nous nous Ppcda1-mp +-nous nous Ppcda1-mp +-nous nous Ppcsa1-mp +-on on Ppcsa3-ms +-t' te Ppcda2-ms +-t-elle elle Ppcsa3-fs +-t-elles elle Ppcsa3-fp +-t-en en Ppcda1-ms +-t-il il Ppcsa3-ms +-t-il il Ppcsa---- +-t-ils il Ppcsa3-mp +-t-on on Ppcsa3-ms +-t-y y Ppcda1-ms +-toi toi Ppcda2-ms +-toi toi Ppcda2-ms +-tu tu Ppcsa2-ms +-vous vous Ppcda2-mp +-vous vous Ppcda2-mp +-vous vous Ppcsa2-mp +-vs vous Ppcda2-mp +-vs vous Ppcda2-mp +-vs vous Ppcsa2-mp +-y y Ppcda1-ms +-y y Ppcda1-ms . . Ff ... ... 
Fa / ou Cc @@ -135127,7 +135127,7 @@ cetteputainde ceputainde Dd--fs- ceux celui Pd-----mp ceux-ci celui-ci Pd-----mp ceux-là celui-là Pd-----mp -ch' cln Ppcsa1-ms +ch' je Ppcsa1-ms ch'ti petit Afha-ms ch. chapitre Ncgf-- cha-cha-cha cha-cha-cha Ncgm-- @@ -140181,6 +140181,8 @@ chiliens chilien Ncgmp- chilio-_ chilio-_1 Ep chimie chimie Ncgfs- chimies chimie Ncgfp- +chimio chimiothérapie Ncgfs- +chimios chimiothérapie Ncgfp- chimiothérapie chimiothérapie Ncgfs- chimiothérapies chimiothérapie Ncgfp- chimique chimique Afha--- @@ -253163,8 +253165,8 @@ eldorado eldorado Ncgms- eldorados eldorado Ncgmp- elfe elfe Ncgms- elfes elfe Ncgmp- -elle cln Ppcsa3-fs -elles cln Ppcsa3-fp +elle elle Ppcsa3-fs +elles elle Ppcsa3-fp ellipse ellipse Ncgfs- ellipses ellipse Ncgfp- ellipsoïdal ellipsoïdal Afha-ms @@ -260140,8 +260142,8 @@ empêtrées empêtré Afha-fp empêtrés empêtrer Vppsi-mp empêtrés empêtrer Vppsp-mp empêtrés empêtré Afha-mp -en clg Ppcda1-ms -en cll Ppcda1-ms +en en Ppcda1-ms +en en Ppcda1-ms en en Sg en-cours en-cours Ncgm-- en-deçà en-deçà Rg @@ -336716,7 +336718,7 @@ ijawe ijaw Afha-fs ijawes ijaw Afha-fp ijaws ijaw Afha-mp ijaws ijaw Ncgmp- -il cln Ppcsa3-ms +il il Ppcsa3-ms il ilimp Ppcsa---- ilang ilang Ncgms- ilang-ilang ilang-ilang Ncgms- @@ -337129,7 +337131,7 @@ illégitimité illégitimité Ncgfs- illégitimités illégitimité Ncgfp- ilote ilote Ncgms- ilotes ilote Ncgmp- -ils cln Ppcsa3-mp +ils ils Ppcsa3-mp ilya ilya Sg iléal iléal Afha-ms iléale iléal Afha-fs @@ -356564,7 +356566,7 @@ ixia ixia Ncgfs- ixias ixia Ncgfp- ixode ixode Ncgms- ixodes ixode Ncgmp- -j' cln Ppcsa1-ms +j' je Ppcsa1-ms jabiru jabiru Ncgms- jabirus jabiru Ncgmp- jabla jabler Vpisi3-s @@ -358156,7 +358158,7 @@ jdanoviennes jdanovien Afha-fp jdanoviens jdanovien Afha-mp jdanovo-maoïste jdanovo-maoïste Afha--- jdanovo-maoïstes jdanovo-maoïste Afha--- -je cln Ppcsa1-ms +je je Ppcsa1-ms je-m'en-fichisme je-m'en-fichisme Ncgms- je-m'en-fichismes je-m'en-fichisme Ncgmp- je-m'en-fichiste 
je-m'en-fichiste Afha--- @@ -361518,14 +361520,14 @@ kérogène kérogène Ncgms- kérogènes kérogène Ncgmp- kérosène kérosène Ncgms- kérosènes kérosène Ncgmp- -l' cla Ppcda3-fs -l' cla Ppcda3-ms +l' la Ppcda3-fs +l' le Ppcda3-ms l' le Da--msd l'autre l'autre Pi-----ms -l'on cln Ppcsa3-ms +l'on on Ppcsa3-ms l'un l'un Pi-----ms l'une l'un Pi-----fs -la cla Ppcda3-fs +la la Ppcda3-fs la le Da--fsd laVarenne-Saint-Hilaire La-Varenne-Saint-Hilaire Npgfs- laVarenne-St-Hilaire La-Varenne-Saint-Hilaire Npgfs- @@ -365081,7 +365083,7 @@ laïussé laïusser Vppsi-ms laïussée laïusser Vppsi-fs laïussées laïusser Vppsi-fp laïussés laïusser Vppsi-mp -le cla Ppcda3-ms +le la Ppcda3-ms le le Da--msd le__det le Da--msd leader leader Ncgms- @@ -365259,7 +365261,7 @@ lequel lequel Pr-n---ms lequel lequel Pt-d---ms lequel lequel Pt-n---ms lerche lerche Rg -les cla Ppcda3-mp +les le Ppcda3-mp les le Da--mpd les__det le Da--mpd lesautres l'autre Pi-----mp @@ -365564,7 +365566,7 @@ leucémique leucémique Ncgms- leucémiques leucémique Ncgmp- leude leude Ncgms- leudes leude Ncgmp- -leur cld Ppcda3-mp +leur sien Ppcda3-mp leur son Ds3pms- leurra leurrer Vpisi3-s leurra leurrer Vpisp3-s @@ -370714,10 +370716,10 @@ lugé luger Vppsi-ms lugée luger Vppsi-fs lugées luger Vppsi-fp lugés luger Vppsi-mp -lui cld Ppcda3-ms -lui-même cln Ppcsa3-ms -lui-même cln Px---1-ms -lui-même cln Px---3-ms +lui lui Ppcda3-ms +lui-même lui-même Ppcsa3-ms +lui-même lui-même Px---1-ms +lui-même lui-même Px---3-ms luira luire Vpifi3-s luirai luire Vpifi1-s luiraient luire Vpici3-p @@ -372749,9 +372751,9 @@ lût lire Vpsii3-s lût lire Vpsip3-s lûtes lire Vpisi2-p lûtes lire Vpisp2-p -m' cla Ppcda1-ms -m' cld Ppcda1-ms -m' clr Px---1-ms +m' m' Ppcda1-ms +m' m' Ppcda1-ms +m' m' Px---1-ms m'as-tu-vu m'as-tu-vu Ncgm-- m'as-tu-vue m'as-tu-vu Ncgf-- ma son Ds1sfs- @@ -381457,9 +381459,9 @@ maïserie maïserie Ncgfs- maïseries maïserie Ncgfp- maïzena maïzena Ncgfs- maïzenas maïzena Ncgfp- -me cla Ppcda1-ms -me cld Ppcda1-ms -me clr 
Px---1-ms +me me Ppcda1-ms +me me Ppcda1-ms +me me Px---1-ms mea-culpa mea-culpa Ncgm-- meaculpa meaculpa Ncgm-- meau meau Ncgmp- @@ -386919,8 +386921,8 @@ mogols mogol Afha-mp mogols mogol Ncgmp- mohair mohair Ncgms- mohairs mohair Ncgmp- -moi cla Ppcda1-ms -moi cld Ppcda1-ms +moi moi Ppcda1-ms +moi moi Ppcda1-ms moi moi Ncgms- moie moie Ncgfs- moies moie Ncgfp- @@ -402505,10 +402507,10 @@ nourrît nourrir Vpsii3-s nourrît nourrir Vpsip3-s nourrîtes nourrir Vpisi2-p nourrîtes nourrir Vpisp2-p -nous cla Ppcda1-mp -nous cld Ppcda1-mp -nous cln Ppcsa1-mp -nous clr Px---1-mp +nous nous Ppcda1-mp +nous nous Ppcda1-mp +nous nous Ppcsa1-mp +nous nous Px---1-mp nouure nouure Ncgfs- nouures nouure Ncgfp- nouveau nouveau Afha-ms @@ -408188,7 +408190,7 @@ omît omettre Vpsii3-s omît omettre Vpsip3-s omîtes omettre Vpisi2-p omîtes omettre Vpisp2-p -on cln Ppcsa3-ms +on on Ppcsa3-ms on-dit on-dit Ncgm-- on-line on-line Afha--- on-lines on-line Afha--- @@ -527523,12 +527525,12 @@ rôtîtes rôtir Vpisi2-p rôtîtes rôtir Vpisp2-p röntgen röntgen Ncgms- röntgens röntgen Ncgmp- -s' clar Px---3-mp -s' clar Px---3-ms -s' cldr Px---3-mp -s' cldr Px---3-ms -s' clr Px---3-mp -s' clr Px---3-ms +s' se Px---3-mp +s' se Px---3-ms +s' se Px---3-mp +s' se Px---3-ms +s' se Px---3-mp +s' se Px---3-ms s'agissant s'agissant Sg s'ilteplaît s'ilteplaît Rg s'ilvousplaît s'ilvousplaît Rg @@ -535302,12 +535304,12 @@ scénographique scénographique Afha--- scénographiques scénographique Afha--- scénologie scénologie Ncgfs- scénologies scénologie Ncgfp- -se clar Px---3-mp -se clar Px---3-ms -se cldr Px---3-mp -se cldr Px---3-ms -se clr Px---3-mp -se clr Px---3-ms +se se Px---3-mp +se se Px---3-ms +se se Px---3-mp +se se Px---3-ms +se se Px---3-mp +se se Px---3-ms seau seau Ncgms- sebka sebka Ncgfs- sebkas sebka Ncgfp- @@ -548902,6 +548904,8 @@ stimugène stimugène Ncgms- stimugènes stimugène Ncgmp- stimula stimuler Vpisi3-s stimula stimuler Vpisp3-s +stimulable stimulable Afha--- +stimulables stimulable Afha--- 
stimulai stimuler Vpisi1-s stimulai stimuler Vpisp1-s stimulaient stimuler Vpiii3-p @@ -563716,10 +563720,10 @@ sût savoir Vpsii3-s sût savoir Vpsip3-s sûtes savoir Vpisi2-p sûtes savoir Vpisp2-p -t' cla Ppcda2-ms -t' cld Ppcda2-ms -t' cln Ppcsa2-ms -t' clr Px---2-ms +t' tu Ppcda2-ms +t' tu Ppcda2-ms +t' tu Ppcsa2-ms +t' tu Px---2-ms t-shirt t-shirt Ncgms- t-shirts t-shirt Ncgmp- t. tome Ncgm-- @@ -567993,9 +567997,9 @@ tchétchène tchétchène Afha--- tchétchène tchétchène Ncgms- tchétchènes tchétchène Afha--- tchétchènes tchétchène Ncgmp- -te cla Ppcda2-ms -te cld Ppcda2-ms -te clr Px---2-ms +te te Ppcda2-ms +te te Ppcda2-ms +te te Px---2-ms technicien technicien Ncgms- technicienne technicien Ncgfs- techniciennes technicien Ncgfp- @@ -572996,8 +573000,8 @@ togolaises togolais Afha-fp togolaises togolais Ncgfp- togolo-_ togolo-_1 Ep tohu-bohu tohu-bohu Ncgm-- -toi cla Ppcda2-ms -toi cld Ppcda2-ms +toi toi Ppcda2-ms +toi toi Ppcda2-ms toi toi Ncgms- toilage toilage Ncgms- toilages toilage Ncgmp- @@ -585383,7 +585387,7 @@ tsé-tsé tsé-tsé Ncgf-- tt tout Afha-ms tte tout Afha-fs ttes tout Afha-fp -tu cln Ppcsa2-ms +tu tu Ppcsa2-ms tu taire Vppsi-ms tu taire Vppsm-ms tu taire Vppsp-ms @@ -602772,10 +602776,10 @@ vouons vouer Vpipp1-p vouons vouer Vpmpp1-p vouons vouer Vpipt1-p vouons vouer Vpmpt1-p -vous cla Ppcda2-mp -vous cld Ppcda2-mp -vous cln Ppcsa2-mp -vous clr Px---2-mp +vous vous Ppcda2-mp +vous vous Ppcda2-mp +vous vous Ppcsa2-mp +vous vous Px---2-mp vousoie vousoyer Vpipt3-s vousoie vousoyer Vpmpt2-s vousoiement vousoiement Ncgms- @@ -603428,12 +603432,12 @@ vrombîmes vrombir Vpisi1-p vrombît vrombir Vpsii3-s vrombîtes vrombir Vpisi2-p vroum vroum I -vs cla Ppcda2-mp -vs cld Ppcda2-mp -vs cln Ppcsa2-mp -vs clr Px---2-mp -vs vs Sg -vs. vs Sg +vs vous Ppcda2-mp +vs vous Ppcda2-mp +vs vous Ppcsa2-mp +vs vous Px---2-mp +vs versus Sg +vs. 
versus Sg vu voir Vppsi-ms vu voir Vppsm-ms vu voir Vppsp-ms @@ -605094,8 +605098,8 @@ xérophytique xérophytique Afha--- xérophytiques xérophytique Afha--- xérus xérus Ncgm-- xérès xérès Ncgm-- -y cld Ppcda1-ms -y cll Ppcda1-ms +y y Ppcda1-ms +y y Ppcda1-ms yacht yacht Ncgms- yacht-club yacht-club Ncgms- yacht-clubs yacht-club Ncgmp- @@ -629835,3 +629839,6 @@ trois trois ADJNUM vingt vingt ADJNUM un un ADJNUM FOREIGN FOREIGN Ee +smartphone smartphone Ncgms- +smartphones smartphone Ncgmp- +flashy flashy Afha--- diff --git a/lima_linguisticdata/cmake/LinguisticData.cmake b/lima_linguisticdata/cmake/LinguisticData.cmake index bac286f46..e2150b140 100644 --- a/lima_linguisticdata/cmake/LinguisticData.cmake +++ b/lima_linguisticdata/cmake/LinguisticData.cmake @@ -357,6 +357,10 @@ macro (LIMA_GENERIC_CONFIGENV _lang) # Add custom command to copy files to execEnv (rules to produce them) # and Add destitation files to lima-execEnv target's dependencies list + CustomCopyFileAndAddExecEnvDependency( + ${CMAKE_SOURCE_DIR}/SRLIntegration/FrameNet-modex.xml + ${CMAKE_BINARY_DIR}/execEnv/config/FrameNet-modex.xml + ) CustomCopyFileAndAddExecEnvDependency( ${CMAKE_SOURCE_DIR}/SRLIntegration/VerbNet-modex.xml ${CMAKE_BINARY_DIR}/execEnv/config/VerbNet-modex.xml @@ -552,6 +556,17 @@ macro (SPECIFICENTITIESCONFIGENV _subtarget _lang _group) COMMENT "create config env for specific entities rules (${_group}-modex.xml)" VERBATIM ) + add_custom_command( + OUTPUT ${CMAKE_BINARY_DIR}/execEnv/config/FrameNet-modex.xml + COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_BINARY_DIR}/execEnv/config + COMMAND ${CMAKE_COMMAND} -E copy + ${CMAKE_SOURCE_DIR}/SRLIntegration/FrameNet-modex.xml + ${CMAKE_BINARY_DIR}/execEnv/config/FrameNet-modex.xml + DEPENDS + ${CMAKE_SOURCE_DIR}/SRLIntegration/FrameNet-modex.xml + COMMENT "create config env for specific entities rules (FrameNet-modex.xml)" + VERBATIM + ) add_custom_command( OUTPUT ${CMAKE_BINARY_DIR}/execEnv/config/VerbNet-modex.xml COMMAND 
${CMAKE_COMMAND} -E make_directory ${CMAKE_BINARY_DIR}/execEnv/config @@ -625,6 +640,7 @@ macro (SPECIFICENTITIESCONFIGENV _subtarget _lang _group) rules-${_lang}-${_group}-configEnv-${_subtarget} ALL DEPENDS ${CMAKE_BINARY_DIR}/execEnv/config/${_group}-modex.xml + DEPENDS ${CMAKE_BINARY_DIR}/execEnv/config/FrameNet-modex.xml DEPENDS ${CMAKE_BINARY_DIR}/execEnv/config/VerbNet-modex.xml DEPENDS ${CMAKE_BINARY_DIR}/execEnv/config/SpecificEntities-modex.xml DEPENDS ${CMAKE_BINARY_DIR}/execEnv/config/lima-common-${_lang}.xml diff --git a/lima_linguisticdata/rules-idiom/fre/src/idioms-fre.txt b/lima_linguisticdata/rules-idiom/fre/src/idioms-fre.txt index 7b0f2dcca..5014b5000 100644 --- a/lima_linguisticdata/rules-idiom/fre/src/idioms-fre.txt +++ b/lima_linguisticdata/rules-idiom/fre/src/idioms-fre.txt @@ -3024,6 +3024,9 @@ ID;i;A;Quant;Quant à;préposition;quant à ID;i;A;Quant;Quant au;préposition article;quant à ID;i;A;Quant;Quant aux;préposition article pluriel;quant à ID;i;A;[D]aujourd';[D]aujourd' hui;adverbe;aujourd'hui +GC;i;A;[D]Aujourd';[D]Aujourd' hui;adverbe;aujourd'hui +GC;i;A;[D]aujourd’;[D]aujourd’ hui;adverbe;aujourd'hui +GC;i;A;[D]Aujourd’;[D]Aujourd’ hui;adverbe;aujourd'hui ID;i;A;extenso;in extenso;adverbe; ID;i;A;extremis;in extremis;adverbe; ID;i;A;facto;de facto;adverbe; diff --git a/lima_linguisticdata/scratch/LinguisticProcessings/eng/tokenizerAutomaton-eng.chars.tok b/lima_linguisticdata/scratch/LinguisticProcessings/eng/tokenizerAutomaton-eng.chars.tok index 24b0f15db..c4e1d4bb1 100644 --- a/lima_linguisticdata/scratch/LinguisticProcessings/eng/tokenizerAutomaton-eng.chars.tok +++ b/lima_linguisticdata/scratch/LinguisticProcessings/eng/tokenizerAutomaton-eng.chars.tok @@ -59,8 +59,8 @@ chars { 0029, RIGHT PARENTHESIS, c_del1 ; 002A, ASTERISK, c_del1 ; 002B, PLUS SIGN, c_plus ; -002C, COMMA, c_comma ; -002D, HYPHEN-MINUS, m_pattern ; +002C, COMMA, c_comma, u002C ; +002D, HYPHEN-MINUS, m_pattern, u002D ; 002E, FULL STOP, c_dot ; 002F, SOLIDUS, 
c_slash ; 0030, DIGIT ZERO, c_5, m0030 ; diff --git a/lima_linguisticdata/scratch/LinguisticProcessings/eng/tokenizerAutomaton-eng.tok b/lima_linguisticdata/scratch/LinguisticProcessings/eng/tokenizerAutomaton-eng.tok index 60118658a..baa4bb172 100644 --- a/lima_linguisticdata/scratch/LinguisticProcessings/eng/tokenizerAutomaton-eng.tok +++ b/lima_linguisticdata/scratch/LinguisticProcessings/eng/tokenizerAutomaton-eng.tok @@ -72,7 +72,7 @@ - c_M / SINGLE_UPPER (T_ALPHA,T_CAPITAL_1ST) - c_m / ALL_LOWER (T_ALPHA,T_SMALL) - c_5 / INTEGER (T_NUMERIC,T_INTEGER) - - c_hyphen|c_plus c_5 / INTEGER (T_NUMERIC) + - c_hyphen|c_plus c_5 / INTEGER (T_NUMERIC,T_INTEGER) - m_pattern m_pattern / PATTERN (T_PATTERN) - c_lowline / START - c_other / START @@ -80,6 +80,7 @@ - c_grave / GRAVE (T_WORD_BRK) - c_del1|c_comma|c_slash|c_hyphen|c_quote|c_percent|c_fraction|m_line / DELIMITER (T_WORD_BRK) - c_del2|c_dot c_par c_par / DELIMITER (T_PARAGRAPH_BRK) + - c_dot c_b c_dot / DELIMITER (T_WORD_BRK) - c_dot c_dot c_dot = START (T_SENTENCE_BRK) - c_del2|c_dot / DELIMITER (T_SENTENCE_BRK) - c_lowline / ALPHA @@ -98,6 +99,8 @@ (DELIMITER) { - m_eof = END - c_grave = GRAVE (T_WORD_BRK) + - c_b c_dot > DELIMITER + - c_dot c_b > DELIMITER - c_del1|c_comma|c_slash|c_hyphen|c_quote|c_percent|c_fraction|c_dot|m_line = DELIMITER (T_WORD_BRK) - c_5 = INTEGER (T_NUMERIC,T_INTEGER) - c_par = IGNORE (T_SENTENCE_BRK) diff --git a/lima_linguisticdata/scratch/LinguisticProcessings/fre/tokenizerAutomaton-fre.chars.tok b/lima_linguisticdata/scratch/LinguisticProcessings/fre/tokenizerAutomaton-fre.chars.tok index 7498a74fe..d3751c9f8 100644 --- a/lima_linguisticdata/scratch/LinguisticProcessings/fre/tokenizerAutomaton-fre.chars.tok +++ b/lima_linguisticdata/scratch/LinguisticProcessings/fre/tokenizerAutomaton-fre.chars.tok @@ -59,20 +59,20 @@ chars { 0029, RIGHT PARENTHESIS, c_del1 ; 002A, ASTERISK, c_del1 ; 002B, PLUS SIGN, c_plus ; -002C, COMMA, c_comma ; +002C, COMMA, c_comma, u002C ; 002D, 
HYPHEN-MINUS, m_pattern ; 002E, FULL STOP, c_dot ; 002F, SOLIDUS, c_slash ; -0030, DIGIT ZERO, c_5 ; -0031, DIGIT ONE, c_5 ; -0032, DIGIT TWO, c_5 ; -0033, DIGIT THREE, c_5 ; -0034, DIGIT FOUR, c_5 ; -0035, DIGIT FIVE, c_5 ; -0036, DIGIT SIX, c_5 ; -0037, DIGIT SEVEN, c_5 ; -0038, DIGIT EIGHT, c_5 ; -0039, DIGIT NINE, c_5 ; +0030, DIGIT ZERO, c_5, u0030 ; +0031, DIGIT ONE, c_5, u0031 ; +0032, DIGIT TWO, c_5, u0032 ; +0033, DIGIT THREE, c_5, u0033 ; +0034, DIGIT FOUR, c_5, u0034 ; +0035, DIGIT FIVE, c_5, u0035 ; +0036, DIGIT SIX, c_5, u0036 ; +0037, DIGIT SEVEN, c_5, u0037 ; +0038, DIGIT EIGHT, c_5, u0038 ; +0039, DIGIT NINE, c_5, u0039 ; 003A, COLON, c_del2 ; 003B, SEMICOLON, c_del2 ; 003C, LESS-THAN SIGN, c_del1 ; diff --git a/lima_linguisticdata/scratch/LinguisticProcessings/fre/tokenizerAutomaton-fre.tok b/lima_linguisticdata/scratch/LinguisticProcessings/fre/tokenizerAutomaton-fre.tok index c4770b8e2..4419f3f6c 100644 --- a/lima_linguisticdata/scratch/LinguisticProcessings/fre/tokenizerAutomaton-fre.tok +++ b/lima_linguisticdata/scratch/LinguisticProcessings/fre/tokenizerAutomaton-fre.tok @@ -25,7 +25,6 @@ - c_quote c_b c_del1 > ALPHA - c_quote c_del1 > ALPHA - unknwn > ALPHA - - c_5 > ALPHANUMERIC (T_ALPHANUMERIC) - c_quote c_del = START (T_WORD_BRK) - c_quote = START - c_del1|c_comma|c_slash|c_hyphen|c_quote|c_percent|c_fraction|m_line > DELIMITER (T_WORD_BRK) @@ -39,7 +38,6 @@ - c_V > CARDINAL_ROMAN (T_NUMERIC,T_CARDINAL_ROMAN) - c_M > SINGLE_UPPER (T_CAPITAL,T_ALPHA) - c_m > ALL_LOWER (T_ALPHA,T_SMALL) - - c_5 > ALPHANUMERIC - c_hyphen|c_plus c_5 > INTEGER (T_NUMERIC,T_INTEGER) - m_pattern > PATTERN (T_PATTERN) - c_lowline > START @@ -81,7 +79,7 @@ - c_M / SINGLE_UPPER (T_ALPHA,T_CAPITAL_1ST) - c_m / ALL_LOWER (T_ALPHA,T_SMALL) - c_5 / INTEGER (T_NUMERIC,T_INTEGER) - - c_hyphen|c_plus c_5 / INTEGER (T_NUMERIC) + - c_hyphen|c_plus c_5 / INTEGER (T_NUMERIC,T_INTEGER) - m_pattern m_pattern / PATTERN (T_PATTERN) - c_lowline / START - c_other / START @@ -107,7 
+105,7 @@ - m_eof = END - c_dot c_dot c_dot = SUSP1 (T_SENTENCE_BRK) - c_del1|c_comma|c_slash|c_hyphen|c_quote|c_percent|c_fraction|c_dot|m_line = DELIMITER (T_WORD_BRK) - - c_5 = INTEGER + - c_5 = INTEGER (T_NUMERIC,T_INTEGER) - c_par = IGNORE (T_SENTENCE_BRK) - c_b = IGNORE - c_V = CARDINAL_ROMAN (T_NUMERIC,T_CARDINAL_ROMAN) @@ -128,7 +126,6 @@ - c_quote c_b c_del1 = ALPHA - c_quote c_del1 = ALPHA - unknwn = ALPHA - - c_5 = ALPHANUMERIC (T_ALPHANUMERIC) - c_quote c_del = START (T_WORD_BRK) - c_quote = START - c_all = START @@ -139,7 +136,7 @@ - c_hyphen c_a_t c_quote = TEUPHOT (T_ALPHA) - c_M > ALL_UPPER (T_CAPITAL) - c_m > LOWER_1ST_UPPER - - c_5 > ALPHANUMERIC + - c_5 > ALPHANUMERIC (T_ALPHANUMERIC) - c_dot c_Mm c_dot > ACRONYM_1 (T_ACRONYM) - c_dot c_b > ABBREV (T_ABBREV) - c_b = IGNORE @@ -156,7 +153,7 @@ - c_hyphen c_a_t c_hyphen = TEUPHOT (T_ALPHA) - c_hyphen c_a_t c_quote = TEUPHOT (T_ALPHA) - c_M > ALL_UPPER - - c_5 > ALPHANUMERIC + - c_5 > ALPHANUMERIC (T_ALPHANUMERIC) - c_m > LOWER_UPPER (T_CAPITAL_SMALL) - c_b = IGNORE - c_hyphen c_M > ALL_UPPER (T_HYPHEN_WORD) @@ -174,7 +171,7 @@ - c_quote c_Mm > APOS - c_m > ALL_LOWER - c_M > LOWER_UPPER (T_CAPITAL_SMALL) - - c_5 > ALPHANUMERIC + - c_5 > ALPHANUMERIC (T_ALPHANUMERIC) - c_b = IGNORE - c_hyphen c_M > LOWER_UPPER (T_HYPHEN_WORD) - c_hyphen c_m > ALL_LOWER (T_HYPHEN_WORD) @@ -193,7 +190,7 @@ - c_hyphen c_a_t c_quote = TEUPHOT (T_ALPHA) - c_m > LOWER_1ST_UPPER - c_M > LOWER_UPPER (T_CAPITAL_SMALL) - - c_5 > ALPHANUMERIC + - c_5 > ALPHANUMERIC (T_ALPHANUMERIC) - c_quote c_Mm > APOS - c_hyphen c_M > LOWER_UPPER (T_HYPHEN_WORD,T_CAPITAL_SMALL) - c_hyphen c_m > LOWER_1ST_UPPER (T_HYPHEN_WORD,T_CAPITAL_SMALL) @@ -210,7 +207,7 @@ - c_hyphen c_a_t c_quote = TEUPHOT (T_ALPHA) - c_Mm > LOWER_UPPER - c_b = IGNORE - - c_5 > ALPHANUMERIC + - c_5 > ALPHANUMERIC (T_ALPHANUMERIC) - c_hyphen c_Mm > LOWER_UPPER (T_HYPHEN_WORD,T_CAPITAL_SMALL) - c_del1|c_comma|c_slash|c_hyphen|c_quote|c_percent|c_fraction|m_line = 
DELIMITER (T_WORD_BRK) - c_del2|c_dot = DELIMITER (T_SENTENCE_BRK) @@ -251,7 +248,7 @@ - c_l_eg c_l_m c_l_e | c_l_e c_l_m c_l_e > ORDINAL_ROMAN2 (T_NUMERIC,T_ORDINAL_ROMAN) - c_l_n c_l_d | c_l_e c_l_r > ORDINAL_ROMAN1 (T_NUMERIC,T_ORDINAL_ROMAN) - c_l_eg > ORDINAL_ROMAN (T_NUMERIC,T_ORDINAL_ROMAN) - - c_5 > ALPHANUMERIC + - c_5 > ALPHANUMERIC (T_ALPHANUMERIC) - c_dot c_b|c_M > SINGLE_UPPER (T_NOT_ROMAN,T_ALPHA,T_CAPITAL_1ST) - [c_del|m_line|m_eof] [c_M] c_m > LOWER_1ST_UPPER (T_NOT_ROMAN,T_ALPHA,T_CAPITAL_1ST) - [c_del|m_line|m_eof] [c_M] c_M > ALL_UPPER (T_NOT_ROMAN,T_ALPHA,T_CAPITAL) @@ -329,6 +326,12 @@ - c_all > START - m_eof = END } +(ORDINAL_INTEGER2) { + - c_all > ORDINAL_INTEGER1 +} +(ORDINAL_INTEGER1) { + - c_all > ORDINAL_INTEGER +} (ORDINAL_INTEGER) { - c_hyphen c_a_t c_hyphen = TEUPHOT (T_ALPHA) - c_hyphen c_a_t c_quote = TEUPHOT (T_ALPHA) diff --git a/lima_linguisticdata/syntacticAnalysis/eng/rules-eng-finalize.txt b/lima_linguisticdata/syntacticAnalysis/eng/rules-eng-finalize.txt new file mode 100644 index 000000000..2e85779b6 --- /dev/null +++ b/lima_linguisticdata/syntacticAnalysis/eng/rules-eng-finalize.txt @@ -0,0 +1,47 @@ +########################################################### +# +# rules to recopy dependencies pointing to coordinated +# tokens onto the other member of the coordination +# +# Created on Wed May 11 2016 +# by Gael de Chalendar (Gael.de-Chalendar@cea.fr) +# +########################################################### + +set encoding=utf8 +using modex lima-analysis.xml +using groups LinguisticProcessing + +#---------------------------------------------------------------------- +# microcategories classes +#---------------------------------------------------------------------- +use categoriesClassesDeclaration-eng.txt + +#---------------------------------------------------------------------- +# +#---------------------------------------------------------------------- +@ConjCoord:@Tout (@Tout){0-n}:(@Tout){0-n} 
@Tout:SYNTACTIC_RELATION: ++!SecondUngovernedBy(trigger.1,left.1,"COORD1") ++!SecondUngovernedBy(trigger.1,right.2,"COORD2") ++CopyIncomingRelationsTo(left.1,right.2,"SUJ_V") +=>AddRelationInGraph() +=AddRelationInGraph() +=AddRelationInGraph() +#=Simplify() +=Simplify() +=Simplify() += + + + + @@ -53,8 +57,9 @@ - - + + + @@ -401,8 +406,9 @@ - + + @@ -470,6 +476,22 @@ + + + + + + + + + + + + + + + + @@ -704,16 +726,21 @@ - + - - - + + + + + + + + @@ -784,6 +811,13 @@ + + + + + + + @@ -950,6 +984,18 @@ + + + + + + + + + + + + diff --git a/lima_linguisticprocessing/conf/lima-lp-fre.xml b/lima_linguisticprocessing/conf/lima-lp-fre.xml index f037fccbb..f47230360 100644 --- a/lima_linguisticprocessing/conf/lima-lp-fre.xml +++ b/lima_linguisticprocessing/conf/lima-lp-fre.xml @@ -1,9 +1,19 @@ + + + + + + + + + + @@ -19,10 +29,17 @@ + + + + - + + @@ -32,7 +49,7 @@ - + @@ -44,7 +61,7 @@ - + @@ -53,11 +70,16 @@ + + + + + - + @@ -67,17 +89,8 @@ - - - - - - - - - - + @@ -111,7 +124,7 @@ - + @@ -123,11 +136,13 @@ + - + - + + @@ -140,7 +155,7 @@ - + @@ -157,8 +172,10 @@ - - + + + + @@ -173,23 +190,23 @@ - - - - - + + + + + - + - + @@ -309,15 +326,27 @@ - + - + + + + + + + + + + + + + @@ -328,13 +357,14 @@ - + + - + @@ -389,10 +419,10 @@ - @@ -405,11 +435,11 @@ - - - - - + + + + + @@ -464,7 +494,16 @@ - @@ -598,18 +637,18 @@ - - - - - - - - - + + + + + + + + + - - + + @@ -625,9 +664,41 @@ - + + + + + + + + + + + + + + + + @@ -635,18 +706,39 @@ - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - + + @@ -656,18 +748,18 @@ - - - - - - - - - + + + + + + + + + - - + + @@ -678,18 +770,18 @@ - - - - - - - - - + + + + + + + + + - - + + @@ -699,18 +791,18 @@ - - - - - - - - - + + + + + + + + + - - + + @@ -720,18 +812,18 @@ - - - - - - - - - + + + + + + + + + - - + + @@ -752,18 +844,18 @@ - - - - - - - - - + + + + + + + + + - - + + @@ -883,6 +975,9 @@ + + + diff --git a/lima_linguisticprocessing/conf/lima-lp-tva-eng.xml 
b/lima_linguisticprocessing/conf/lima-lp-tva-eng.xml index d68c430a7..d6d40a984 100644 --- a/lima_linguisticprocessing/conf/lima-lp-tva-eng.xml +++ b/lima_linguisticprocessing/conf/lima-lp-tva-eng.xml @@ -146,8 +146,9 @@ - + + diff --git a/lima_linguisticprocessing/conf/lima-lp-tva-fre.xml b/lima_linguisticprocessing/conf/lima-lp-tva-fre.xml index fe33a8266..c4a238758 100644 --- a/lima_linguisticprocessing/conf/lima-lp-tva-fre.xml +++ b/lima_linguisticprocessing/conf/lima-lp-tva-fre.xml @@ -143,8 +143,9 @@ - + + diff --git a/lima_linguisticprocessing/data/test-eng.default.xml b/lima_linguisticprocessing/data/test-eng.default.xml index 4d15f9838..f23c1712b 100644 --- a/lima_linguisticprocessing/data/test-eng.default.xml +++ b/lima_linguisticprocessing/data/test-eng.default.xml @@ -57,14 +57,19 @@ - + - 24.99 reçoit la categorie num card - 24,99 reçoit la categorie num card + + 24.99 reçoit la categorie num card - + + + The negative numbers in digits -27 and -25 must be normalized into themselves. + + + + + + + + + + diff --git a/lima_linguisticprocessing/data/test-fre.default.xml b/lima_linguisticprocessing/data/test-fre.default.xml index f1538722e..3b6fe7109 100644 --- a/lima_linguisticprocessing/data/test-fre.default.xml +++ b/lima_linguisticprocessing/data/test-fre.default.xml @@ -20,6 +20,16 @@ left="XPATH#//data_structure/vertex[token/position=4]/data/unknown_word//p[@prop='MACRO']/@val" operator="contains" right="NC"/> + + @@ -64,9 +74,14 @@ - 24,99 reçoit la categorie num card + 24,99 gets the DET tag and its lemma is itself + @@ -87,6 +102,7 @@ right="euritrack"/> + E.U.R.I.T.R.A.C.K. 
est un mot inconnu, doit être normalisé 'euritrack' @@ -94,7 +110,6 @@ - EURITRACK est un mot inconnu, doit être normalisé 'euritrack' 24.99 reçoit la categorie num card + + + + + + + + + + euritrack est un mot inconnu tout en minuscules, doit être normalisé 'euritrack' + + + + 'euri100' is an unknown word starting by lowercase letters and ending by digits ; it must be normalized in itself 'euri100'. + + + + + + + + + + + The number in digits 27 must be normalized into itself. + + + + + + + + + + + The negative numbers in digits -27 and -25 must be normalized into themselves. + + + + + + + + + + diff --git a/lima_linguisticprocessing/data/test-fre.se.xml b/lima_linguisticprocessing/data/test-fre.se.xml index 36d663664..1a2131c08 100644 --- a/lima_linguisticprocessing/data/test-fre.se.xml +++ b/lima_linguisticprocessing/data/test-fre.se.xml @@ -682,22 +682,22 @@ left="XPATH#//specific_entities/specific_entity[position=28][length=12]/type" operator="=" right="DateTime.DATE"/> - + + TIMEX : 25 + + + + + + + + + + @@ -1000,12 +1000,12 @@ right="Numex.NUMEX"/> comment="pourcentage" left="XPATH#//specific_entities/specific_entity[position=37][length=4]/type" operator="=" -right="Numex.NUMEX"/> +right="Numex.NUMBER"/> +right="Numex.NUMBER"/> @@ -1043,5 +1043,34 @@ operator="=" operator="=" right="Numex.NUMEX"/> + + + NUMEX: test consecutive numbers. See issue #50 on github + + + + + + + + + + + + + + diff --git a/lima_linguisticprocessing/data/test-fre.simpleword.xml b/lima_linguisticprocessing/data/test-fre.simpleword.xml index cbb4dc5bf..e2cd613f3 100644 --- a/lima_linguisticprocessing/data/test-fre.simpleword.xml +++ b/lima_linguisticprocessing/data/test-fre.simpleword.xml @@ -26,7 +26,11 @@ test recherche de mot dans le dico : vérification présence nappes, fn : nappe. - + @@ -37,183 +41,187 @@ test recherche de mot dans le dico : vérification présence nappes, fn : napper. 
- + + + + + + + + + + + eleve + + + + + + + + + + + + éleve + + + + + + + + + + + + elève + + + + + + + + + + + + éléve + + + + + + + + + + + + Frère + + + + + + + + + + + Frere + + + + + + + + + + + Amedée + + + + + + + + + + + marche + + + + + + + + + + + evenement + + + + + + + + + + + createur + + - - - - - - - - - eleve - - - - - - - - - - - - éleve - - - - - - - - - - - - elève - - - - - - - - - - - - éléve - - - - - - - - - - - - Frère - - - - - - - - - - - Frere - - - - - - - - - - - Amedée - - - - - - - - - - - marche - - - - - - - - - - - evenement - - - - - - - - - - - createur - - - @@ -329,23 +337,23 @@ operator="=" right="à-propos"/> - - - - - - - - - - - + + + + + + + + + + + diff --git a/lima_linguisticprocessing/src/linguisticProcessing/client/AbstractLinguisticProcessingClient.h b/lima_linguisticprocessing/src/linguisticProcessing/client/AbstractLinguisticProcessingClient.h index f70ffd2c8..2e0444d20 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/client/AbstractLinguisticProcessingClient.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/client/AbstractLinguisticProcessingClient.h @@ -88,7 +88,11 @@ class LIMA_LINGUISTICPROCESSIONGCLIENT_EXPORT AbstractLinguisticProcessingClient const std::map& metaData, const std::string& pipeline, const std::map& handlers, - const std::set& inactiveUnits = std::set()) const = 0; + const std::set& inactiveUnits = std::set() +#ifdef ANTINNO_SPECIFIC + , Lima::StopAnalyze const& stopAnalyze = Lima::defaultStopAnalyze +#endif + ) const = 0; /** * This function is the same as the previous one but takes a text @@ -99,7 +103,11 @@ class LIMA_LINGUISTICPROCESSIONGCLIENT_EXPORT AbstractLinguisticProcessingClient const std::map& metaData, const std::string& pipeline, const std::map& handlers, - const std::set& inactiveUnits = std::set()) const = 0; + const std::set& inactiveUnits = std::set() +#ifdef ANTINNO_SPECIFIC + ,Lima::StopAnalyze const& stopAnalyze = Lima::defaultStopAnalyze +#endif + ) 
const = 0; }; /** @@ -141,7 +149,7 @@ class AbstractLinguisticProcessingClientFactory : public RegistrableFactory createClient() const = 0; /** * virtual destructor of the LinguisticProcessing client factory diff --git a/lima_linguisticprocessing/src/linguisticProcessing/client/AnalysisHandlers/BowDocumentHandler.cpp b/lima_linguisticprocessing/src/linguisticProcessing/client/AnalysisHandlers/BowDocumentHandler.cpp index dd57a242b..ff2586b8d 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/client/AnalysisHandlers/BowDocumentHandler.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/client/AnalysisHandlers/BowDocumentHandler.cpp @@ -83,7 +83,7 @@ void BowDocumentHandler::endDocument() // read Part( istream, AbstractBoWXMLWriter writer, bool useIterator) // do not use iterator, // std::cout is unused - reader.readBoWDocumentBlock(in, *document, structuredBowHandler, false); + reader.readBoWDocumentBlock(in, *document, structuredBowHandler, false, false); } delete m_bowstream; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/client/AnalysisHandlers/LTRTextHandler.cpp b/lima_linguisticprocessing/src/linguisticProcessing/client/AnalysisHandlers/LTRTextHandler.cpp new file mode 100644 index 000000000..5b8b18557 --- /dev/null +++ b/lima_linguisticprocessing/src/linguisticProcessing/client/AnalysisHandlers/LTRTextHandler.cpp @@ -0,0 +1,87 @@ +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. 
+ + You should have received a copy of the GNU Affero General Public License + along with LIMA. If not, see +*/ +/*************************************************************************** + * Copyright (C) 2004-2012 by CEA LIST * + * * + ***************************************************************************/ + +#include "LTRTextHandler.h" +#include "common/AbstractFactoryPattern/SimpleFactory.h" + +using namespace std; +using namespace Lima::Common::BagOfWords; + +namespace Lima { +namespace LinguisticProcessing { + +LTRTextHandler::LTRTextHandler() + : AbstractTextualAnalysisHandler(),m_ltrstream(),m_ltrtext() +{ +} + + +LTRTextHandler::~LTRTextHandler() +{ +} + +Common::BagOfWords::LTR_Text& LTRTextHandler::getLTRText() +{ + return m_ltrtext; +} + + +void LTRTextHandler::endAnalysis() +{ + // read from completed stream + m_ltrtext.binaryReadFrom(m_ltrstream); +} + + +void LTRTextHandler::startAnalysis() +{ + m_ltrtext.clear(); + // reset stringstream + m_ltrstream.str(""); +} + +void LTRTextHandler::handle(const char* buf, int length) +{ + // store in stream + m_ltrstream.write(buf,length); + //m_writer->handle(buf,length); +} + +void LTRTextHandler::endDocument() +{ +} + +void LTRTextHandler::startDocument(const Common::Misc::GenericDocumentProperties&) +{ +} + +void LTRTextHandler::startNode( const std::string& /*elementName*/, bool /*forIndexing*/ ) +{ +} + +void LTRTextHandler::endNode(const Common::Misc::GenericDocumentProperties& /*props*/) +{ +} + +} // end namespace +} // end namespace diff --git a/lima_linguisticprocessing/src/linguisticProcessing/client/AnalysisHandlers/LTRTextHandler.h b/lima_linguisticprocessing/src/linguisticProcessing/client/AnalysisHandlers/LTRTextHandler.h new file mode 100644 index 000000000..26cb3981b --- /dev/null +++ b/lima_linguisticprocessing/src/linguisticProcessing/client/AnalysisHandlers/LTRTextHandler.h @@ -0,0 +1,68 @@ +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. 
+ + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. If not, see +*/ +/*************************************************************************** + * Copyright (C) 2004-2012 by CEA LIST * + * * + ***************************************************************************/ +#ifndef LIMA_LINGUISTICPROCESSINGLTRTEXTHANDLER_H +#define LIMA_LINGUISTICPROCESSINGLTRTEXTHANDLER_H + +#include "AnalysisHandlersExport.h" + +#include "linguisticProcessing/client/AnalysisHandlers/AbstractTextualAnalysisHandler.h" +#include "linguisticProcessing/common/linearTextRepresentation/ltrText.h" +#include "common/Data/DataTypes.h" + +namespace Lima { + +namespace LinguisticProcessing { + +/** + * @brief LTRTextHandler is a handler for LTR text that gives access to the resulting LTRText through an accessor +*/ +class LIMA_ANALYSISHANDLERS_EXPORT LTRTextHandler : public AbstractTextualAnalysisHandler +{ +public: + LTRTextHandler(); + + virtual ~LTRTextHandler(); + + virtual void endAnalysis(); + virtual void handle(const char* buf, int length) ; + virtual void startAnalysis(); + + void startDocument(const Common::Misc::GenericDocumentProperties&); + void endDocument(); + void startNode( const std::string& elementName, bool forIndexing ); + void endNode(const Common::Misc::GenericDocumentProperties& props); + + Common::BagOfWords::LTR_Text& getLTRText(); + + virtual void setOut( std::ostream* /*out*/ ) {} + +private: + std::stringstream m_ltrstream; + 
Common::BagOfWords::LTR_Text m_ltrtext; +}; + +} + +} + +#endif diff --git a/lima_linguisticprocessing/src/linguisticProcessing/client/AnalysisHandlers/StructuredBoWToBoWDocument.cpp b/lima_linguisticprocessing/src/linguisticProcessing/client/AnalysisHandlers/StructuredBoWToBoWDocument.cpp index 9a216f4b6..964d8e760 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/client/AnalysisHandlers/StructuredBoWToBoWDocument.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/client/AnalysisHandlers/StructuredBoWToBoWDocument.cpp @@ -74,7 +74,7 @@ openSBoWIndexingNode(const Lima::Common::Misc::GenericDocumentProperties* proper void StructuredBoWToBoWDocument:: processSBoWText(const BoWText* boWText, - bool /*unused useIterators*/) + bool /*unused useIterators*/, bool useIndexIterator) { if (! m_inIndexingNode.empty() && m_inIndexingNode.back() && @@ -89,7 +89,7 @@ processSBoWText(const BoWText* boWText, void StructuredBoWToBoWDocument:: processProperties(const Common::Misc::GenericDocumentProperties* properties, - bool /*unused useIterators*/) + bool /*unused useIterators*/, bool /*useIndexIterator*/) { if (m_inIndexingNode.back()) { addProperties(*m_currentDocument,properties); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/client/AnalysisHandlers/StructuredBoWToBoWDocument.h b/lima_linguisticprocessing/src/linguisticProcessing/client/AnalysisHandlers/StructuredBoWToBoWDocument.h index 55a7bf601..32369c4d1 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/client/AnalysisHandlers/StructuredBoWToBoWDocument.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/client/AnalysisHandlers/StructuredBoWToBoWDocument.h @@ -55,10 +55,10 @@ class LIMA_ANALYSISHANDLERS_EXPORT StructuredBoWToBoWDocument : const std::string& elementName); void processSBoWText(const Common::BagOfWords::BoWText* boWText, - bool useIterators); + bool useIterators, bool useIndexIterator); void processProperties(const 
Common::Misc::GenericDocumentProperties* properties, - bool useIterators); + bool useIterators, bool useIndexIterator); void closeSBoWNode(); private: diff --git a/lima_linguisticprocessing/src/linguisticProcessing/client/LinguisticProcessingClientFactory.cpp b/lima_linguisticprocessing/src/linguisticProcessing/client/LinguisticProcessingClientFactory.cpp index 4ef8ecb3d..0f4e2d84b 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/client/LinguisticProcessingClientFactory.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/client/LinguisticProcessingClientFactory.cpp @@ -57,7 +57,7 @@ void LinguisticProcessingClientFactory::configureClientFactory( pipelines); } -AbstractProcessingClient* LinguisticProcessingClientFactory::createClient( +std::shared_ptr< AbstractProcessingClient > LinguisticProcessingClientFactory::createClient( const std::string& id) const { LPCLIENTFACTORYLOGINIT; @@ -79,7 +79,7 @@ std::deque LinguisticProcessingClientFactory::getRegisteredFactorie } -LinguisticProcessingClientFactoryFactory* LinguisticProcessingClientFactoryFactory::s_instance=new LinguisticProcessingClientFactoryFactory(); +std::unique_ptr< LinguisticProcessingClientFactoryFactory > LinguisticProcessingClientFactoryFactory::s_instance=std::unique_ptr< LinguisticProcessingClientFactoryFactory >(new LinguisticProcessingClientFactoryFactory()); } // LinguisticProcessing diff --git a/lima_linguisticprocessing/src/linguisticProcessing/client/LinguisticProcessingClientFactory.h b/lima_linguisticprocessing/src/linguisticProcessing/client/LinguisticProcessingClientFactory.h index 7b208d994..302b95410 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/client/LinguisticProcessingClientFactory.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/client/LinguisticProcessingClientFactory.h @@ -67,7 +67,7 @@ friend class Singleton; * ClientFactory must have been configured before this method is called * Use configureClientFactory() method to configure. 
*/ - AbstractProcessingClient* createClient(const std::string& id) const; + std::shared_ptr< AbstractProcessingClient > createClient(const std::string& id) const; /** * @brief show registered clientId @@ -86,14 +86,14 @@ class LIMA_LINGUISTICPROCESSIONGCLIENT_EXPORT LinguisticProcessingClientFactoryF public: ~LinguisticProcessingClientFactoryFactory(){}; - ProcessingClientFactory* createProcessingClientFactory() const + std::shared_ptr< ProcessingClientFactory > createProcessingClientFactory() const { - return new LinguisticProcessingClientFactory(); + return std::shared_ptr< ProcessingClientFactory >(new LinguisticProcessingClientFactory()); } private: LinguisticProcessingClientFactoryFactory():AbstractProcessingClientFactoryFactory("lpFactory"){}; - static LinguisticProcessingClientFactoryFactory* s_instance; + static std::unique_ptr< LinguisticProcessingClientFactoryFactory > s_instance; }; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/AbstractBoWDocumentHandler.h b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/AbstractBoWDocumentHandler.h index 7854ba75b..fe452acf1 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/AbstractBoWDocumentHandler.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/AbstractBoWDocumentHandler.h @@ -59,8 +59,10 @@ class LIMA_BOW_EXPORT AbstractBoWDocumentHandler : public AbstractDocumentHandle const std::string& elementName) = 0; virtual void openSBoWIndexingNode(const Lima::Common::Misc::GenericDocumentProperties* properties, const std::string& elementName) = 0; - virtual void processSBoWText(const BoWText* boWText, bool useIterators) = 0; -// virtual void processProperties(const Misc::GenericDocumentProperties* properties, bool useIterators) = 0; + virtual void processSBoWText(const BoWText* boWText, bool useIterators, + bool useIndexIterator) = 0; + virtual void processProperties(const Misc::GenericDocumentProperties* 
properties, bool useIterators, + bool useIndexIterator) = 0; virtual void closeSBoWNode() = 0; // virtual void writeDocumentsHeader() = 0; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/AbstractBoWElement.h b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/AbstractBoWElement.h index c441840d5..462d4adcf 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/AbstractBoWElement.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/AbstractBoWElement.h @@ -47,7 +47,11 @@ namespace BagOfWords { /** * enum to characterize the type of the AbstractBoWElement */ -typedef enum { +#ifndef WIN32 +enum class BoWType : unsigned short { +#else +enum BoWType { +#endif BOW_NOTYPE, /**< the AbstractBoWElement is an abstract one that should not be instanciated */ BOW_TOKEN, /**< the AbstractBoWElement is a simple token */ @@ -55,7 +59,21 @@ typedef enum { BOW_NAMEDENTITY, /**< the AbstractBoWElement is a named entity */ BOW_PREDICATE, /**< the AbstractBoWElement is a predicate (n-ary relation, template or semantic frame */ -} BoWType; +}; + +uint8_t toInt(const BoWType& bt); + +template +T& operator<<(T& qd, const BoWType& bt) +{ + if (bt == BoWType::BOW_NOTYPE) qd << "BOW_NOTYPE"; + else if (bt == BoWType::BOW_TOKEN) qd << "BOW_TOKEN"; + else if (bt == BoWType::BOW_TERM) qd << "BOW_TERM"; + else if (bt == BoWType::BOW_NAMEDENTITY) qd << "BOW_NAMEDENTITY"; + else if (bt == BoWType::BOW_PREDICATE) qd << "BOW_PREDICATE"; + else qd << "UNDEFINED"; + return qd; +}; /** * This class is the abstract base class of all elements that can be stored in @@ -79,7 +97,7 @@ class LIMA_BOW_EXPORT AbstractBoWElement * a predicate, 1 for a simple token, n for complex tokens */ virtual uint64_t size(void) const = 0; - virtual BoWType getType() const {return BOW_NOTYPE;} + virtual BoWType getType() const {return BoWType::BOW_NOTYPE;} virtual Lima::LimaString getString(void) const = 0; diff --git 
a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/BinaryWriterBoWDocumentHandler.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/BinaryWriterBoWDocumentHandler.cpp index 3a8c511ad..412a107b1 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/BinaryWriterBoWDocumentHandler.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/BinaryWriterBoWDocumentHandler.cpp @@ -65,7 +65,7 @@ openSBoWIndexingNode(const Misc::GenericDocumentProperties* /*properties*/, void BinaryWriterBoWDocumentHandler:: processSBoWText(const BoWText* boWText, - bool /*useIterators*/) + bool /*useIterators*/, bool /*useIndexIterator*/) { Common::Misc::writeOneByteInt(m_outputStream,Common::BagOfWords::BOW_TEXT_BLOC); m_writer.writeBoWText(m_outputStream,*boWText); @@ -73,7 +73,7 @@ processSBoWText(const BoWText* boWText, void BinaryWriterBoWDocumentHandler:: processProperties(const Misc::GenericDocumentProperties* properties, - bool /*useIterators*/) + bool /*useIterators*/, bool /*useIndexIterator*/) { Common::Misc::writeOneByteInt(m_outputStream,Common::BagOfWords::DOCUMENT_PROPERTIES_BLOC); properties->write(m_outputStream); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/BinaryWriterBoWDocumentHandler.h b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/BinaryWriterBoWDocumentHandler.h index 9263c2abe..f8dc906ed 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/BinaryWriterBoWDocumentHandler.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/BinaryWriterBoWDocumentHandler.h @@ -54,9 +54,9 @@ class LIMA_BOW_EXPORT BinaryWriterBoWDocumentHandler : public AbstractBoWDocumen void openSBoWIndexingNode(const Misc::GenericDocumentProperties* properties, const std::string& elementName); void processSBoWText(const BoWText* boWText, - bool useIterators); + bool useIterators, bool 
useIndexIterator); void processProperties(const Misc::GenericDocumentProperties* properties, - bool useIterators); + bool useIterators, bool useIndexIterator); void closeSBoWNode(); private: diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/BoWPredicate.h b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/BoWPredicate.h index b1f1cae9a..3a5d3bd31 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/BoWPredicate.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/BoWPredicate.h @@ -65,7 +65,7 @@ class LIMA_BOW_EXPORT BoWPredicate : public AbstractBoWElement MediaticData::EntityType getPredicateType(void) const; void setPredicateType(const MediaticData::EntityType&); - virtual BoWType getType() const { return BOW_PREDICATE; } + virtual BoWType getType() const { return BoWType::BOW_PREDICATE; } virtual Lima::LimaString getString(void) const; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/TextWriterBoWDocumentHandler.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/TextWriterBoWDocumentHandler.cpp index f45f705f3..01075d7ed 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/TextWriterBoWDocumentHandler.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/TextWriterBoWDocumentHandler.cpp @@ -29,6 +29,8 @@ #include "bowTokenIterator.h" #include "bowToken.h" #include "bowText.h" +#include "indexElementIterator.h" +#include "indexElement.h" #include "common/Data/genericDocumentProperties.h" @@ -42,6 +44,8 @@ class TextWriterBoWDocumentHandlerPrivate TextWriterBoWDocumentHandlerPrivate(std::ostream& os); ~TextWriterBoWDocumentHandlerPrivate(); + + void writeIndexElement(const IndexElement& element); std::ostream& m_outputStream; @@ -84,7 +88,7 @@ openSBoWIndexingNode(const Misc::GenericDocumentProperties* properties, void TextWriterBoWDocumentHandler:: 
processSBoWText(const BoWText* boWText, - bool useIterators) + bool useIterators, bool useIndexIterator) { if (useIterators) { BoWTokenIterator it(*boWText); @@ -93,6 +97,14 @@ processSBoWText(const BoWText* boWText, it++; } } + else if (useIndexIterator) { + IndexElementIterator it(*boWText); + while (! it.isAtEnd()) + { + m_d->writeIndexElement(it.getElement()); + it++; + } + } else { m_d->m_outputStream << *boWText; } @@ -100,7 +112,7 @@ processSBoWText(const BoWText* boWText, void TextWriterBoWDocumentHandler:: processProperties(const Misc::GenericDocumentProperties* /*properties*/, - bool /*useIterators*/) + bool /*useIterators*/, bool /*useIndexIterator*/) { //os << *properties; } @@ -110,6 +122,54 @@ closeSBoWNode() { } +void TextWriterBoWDocumentHandlerPrivate::writeIndexElement( + const IndexElement& element) { +// m_outputStream << "" << endl; +// return; +// } +// if (element.isSimpleTerm()) { +// std::string cat = static_cast(Common::MediaticData::MediaticData::single().mediaData(m_language)).getPropertyCodeManager().getPropertyManager("MACRO").getPropertySymbolicValue(static_cast(element.getCategory())); +// +// m_outputStream << " lemma=\"" << xmlString(Common::Misc::limastring2utf8stdstring(element.getSimpleTerm())) +// << "\" category=\"" << cat +// << "\" position=\"" << element.getPosition() +// << "\" length=\"" << element.getLength() << "\""; +// if (element.isNamedEntity()) { +// m_outputStream << " neType=\"" << element.getNamedEntityType() << "\""; +// m_outputStream << " type=\"" << BOW_NAMEDENTITY << "\""; +// } +// else { +// m_outputStream << " type=\"" << BOW_TOKEN << "\""; +// } +// m_outputStream << "/>" << endl; +// return; +// } +// +// // compound +// if (element.isNamedEntity()) { +// m_outputStream << " neType=\"" << element.getNamedEntityType() << "\""; +// m_outputStream << " type=\"" << BOW_NAMEDENTITY << "\""; +// } +// else { +// m_outputStream << " type=\"" << BOW_TERM << "\""; +// } +// m_outputStream << ">" << endl +// << 
" " << endl; +// +// for (uint64_t i(0),size=element.getStructure().size(); i" << endl; +// +// } +// m_outputStream << " " << endl +// << "" << endl; +} + } // end namespace } // end namespace diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/TextWriterBoWDocumentHandler.h b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/TextWriterBoWDocumentHandler.h index fd6ba24a3..93edf3519 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/TextWriterBoWDocumentHandler.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/TextWriterBoWDocumentHandler.h @@ -57,9 +57,9 @@ class LIMA_BOW_EXPORT TextWriterBoWDocumentHandler : public AbstractBoWDocumentH void openSBoWIndexingNode(const Misc::GenericDocumentProperties* properties, const std::string& elementName); void processSBoWText(const BoWText* boWText, - bool useIterators); + bool useIterators, bool useIndexIterator); void processProperties(const Misc::GenericDocumentProperties* properties, - bool useIterators); + bool useIterators, bool useIndexIterator); void closeSBoWNode(); private: diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowBinaryReaderWriter.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowBinaryReaderWriter.cpp index 13f87a833..ee09af42b 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowBinaryReaderWriter.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowBinaryReaderWriter.cpp @@ -137,14 +137,14 @@ void BoWBinaryReader::readHeader(std::istream& file) #ifdef DEBUG_LP BOWLOGINIT; - LDEBUG << "BoWBinaryReader::readHeader type mapping is"; + LDEBUG << "BoWBinaryReader::readHeader type mapping is (shown if logger = TRACE)"; std::ostringstream oss; for (std::map::const_iterator it=m_d->m_entityTypeMapping.begin(),it_end=m_d->m_entityTypeMapping.end(); it!=it_end; it++) { oss << 
(*it).first << " -> " << (*it).second << std::endl; } - LDEBUG << oss.str(); + LTRACE << oss.str(); LDEBUG << "BoWBinaryReader::readHeader end file at: " << file.tellg(); #endif @@ -193,9 +193,14 @@ void BoWBinaryReader::readBoWText(std::istream& file, void BoWBinaryReader::readBoWDocumentBlock(std::istream& file, BoWDocument& document, AbstractBoWDocumentHandler& handler, - bool useIterator) + bool useIterator, + bool useIndexIterator) { BoWBlocType blocType = static_cast( Misc::readOneByteInt(file) ); +#ifdef ANTINNO_BUGFIX + if (file.eof()) + return; +#endif #ifdef DEBUG_LP BOWLOGINIT; LDEBUG << "BoWBinaryReader::readBoWDocumentBlock: read blocType" << blocType; @@ -230,7 +235,7 @@ void BoWBinaryReader::readBoWDocumentBlock(std::istream& file, #endif document.clear(); readBoWText(file,document); - handler.processSBoWText(&document, useIterator); + handler.processSBoWText(&document, useIterator, useIndexIterator); break; } case NODE_PROPERTIES_BLOC: @@ -239,7 +244,7 @@ void BoWBinaryReader::readBoWDocumentBlock(std::istream& file, LDEBUG << "NODE_PROPERTIES_BLOC"; #endif document.Misc::GenericDocumentProperties::read(file); - handler.processProperties(&document, useIterator); + handler.processProperties(&document, useIterator, useIndexIterator); break; } case END_BLOC: @@ -264,6 +269,11 @@ void BoWBinaryReader::readBoWDocumentBlock(std::istream& file, break; } default:; +#ifdef ANTINNO_SPECIFIC +#ifdef DEBUG_LP + LERROR << "MultimediaBinaryReaderIndexer::readMultimediaDocumentBlock: unmanaged block type " << blocType; +#endif +#endif } } @@ -283,19 +293,19 @@ boost::shared_ptr< AbstractBoWElement > BoWBinaryReaderPrivate::readBoWToken( st #endif boost::shared_ptr< AbstractBoWElement > token; switch (type) { - case BOW_TOKEN: { + case BoWType::BOW_TOKEN: { token=boost::shared_ptr< BoWToken >( new BoWToken); readSimpleToken(file, boost::dynamic_pointer_cast(token)); break; } - case BOW_TERM: { + case BoWType::BOW_TERM: { token=boost::shared_ptr< BoWTerm >(new 
BoWTerm); // LDEBUG << "BoWToken: calling read(file) on term"; readSimpleToken(file,boost::dynamic_pointer_cast(token)); readComplexTokenParts(file,boost::dynamic_pointer_cast(token)); break; } - case BOW_NAMEDENTITY: { + case BoWType::BOW_NAMEDENTITY: { token=boost::shared_ptr< BoWNamedEntity >(new BoWNamedEntity); // LDEBUG << "BoWToken: calling read(file) on NE"; readSimpleToken(file,boost::dynamic_pointer_cast(token)); @@ -303,7 +313,7 @@ boost::shared_ptr< AbstractBoWElement > BoWBinaryReaderPrivate::readBoWToken( st readNamedEntityProperties(file,boost::dynamic_pointer_cast(token)); break; } - case BOW_PREDICATE:{ + case BoWType::BOW_PREDICATE:{ token=boost::shared_ptr< BoWPredicate >(new BoWPredicate); readPredicate(file,boost::dynamic_pointer_cast(token)); break; @@ -340,6 +350,15 @@ void BoWBinaryReaderPrivate::readSimpleToken(std::istream& file, Misc::readUTF8StringField(file,inflectedForm); #ifdef DEBUG_LP LDEBUG << "BoWBinaryReader::readSimpleToken read infl: " << inflectedForm; +#endif +#ifdef ANTINNO_SPECIFIC + if (lemma.isEmpty()) + { +#ifdef DEBUG_LP + LDEBUG << "BoWBinaryWriter::readSimpleToken empty lemma, using inflected form instead:" << inflectedForm; +#endif + lemma = inflectedForm; + } #endif LinguisticCode category; uint64_t position,length; @@ -548,7 +567,7 @@ void BoWBinaryWriter::writeBoWDocument(std::ostream& file, { BOWLOGINIT; LERROR << "BoWBinaryWriter: writeBoWDocument non implemented"; - LERROR << "Can not write "<< doc << " into "<< file; + LERROR << "Can not write "<< doc << " into stream"<< &file; } void BoWBinaryWriter::writeBoWToken(std::ostream& file, @@ -563,25 +582,25 @@ void BoWBinaryWriterPrivate::writeBoWToken( std::ostream& file, const boost::sha BOWLOGINIT; LDEBUG << "BoWBinaryWriter::writeBoWToken token type is " << token->getType() << &file; #endif - Misc::writeOneByteInt(file,token->getType()); + Misc::writeOneByteInt(file,toInt(token->getType())); switch (token->getType()) { - case BOW_TOKEN: { + case 
BoWType::BOW_TOKEN: { writeSimpleToken(file,boost::dynamic_pointer_cast(token)); break; } - case BOW_TERM: { + case BoWType::BOW_TERM: { writeSimpleToken(file,boost::dynamic_pointer_cast(token)); writeComplexTokenParts(file,boost::dynamic_pointer_cast(token)); break; } - case BOW_NAMEDENTITY: { + case BoWType::BOW_NAMEDENTITY: { boost::shared_ptr< BoWNamedEntity > ne=boost::dynamic_pointer_cast(token); writeSimpleToken(file,boost::dynamic_pointer_cast(token)); writeComplexTokenParts(file,ne); writeNamedEntityProperties(file,ne); break; } - case BOW_PREDICATE:{ + case BoWType::BOW_PREDICATE:{ writePredicate(file,boost::dynamic_pointer_cast(token)); break; } @@ -612,6 +631,60 @@ void BoWBinaryWriterPrivate::writeSimpleToken(std::ostream& file, #endif Misc::writeUTF8StringField(file,token->getInflectedForm()); Misc::writeCodedInt(file,token->getCategory()); + +#ifdef ANTINNO_SPECIFIC + + // FWI 04/08/2016 : correction de length qui ne tient pas compte des entitées xml dans le lemme + auto beg = token->getPosition(); + auto end = token->getLength() + beg; + //::std::cout << "beg: " << beg << " end: " << end << ::std::endl; + + if (m_shiftFrom.empty()) + { +#ifdef DEBUG_LP + LDEBUG << "BoWBinaryWriter::writeSimpleToken shiftFrom is empty"; +#endif + } + else + { +#ifdef DEBUG_LP + LDEBUG << "BoWBinaryWriter::writeSimpleToken shiftFrom from begin" << beg; + LDEBUG << "BoWBinaryWriter::writeSimpleToken shiftFrom from end" << end; +#endif + auto const it1 = m_shiftFrom.lowerBound(beg-1); + if (it1 == m_shiftFrom.constBegin()) + { +#ifdef DEBUG_LP + LDEBUG << "BoWBinaryWriter::writeSimpleToken shiftFrom from begin: NO shift"; +#endif + } + else + { +#ifdef DEBUG_LP + LDEBUG << "BoWBinaryWriter::writeSimpleToken shiftFrom from begin: shift by" << (it1-1).value(); +#endif + beg += (it1-1).value(); + } + auto const it2 = m_shiftFrom.lowerBound(end-1); + if (it2 == m_shiftFrom.constBegin()) + { +#ifdef DEBUG_LP + LDEBUG << "BoWBinaryWriter::writeSimpleToken shiftFrom from end: 
NO shift"; +#endif + } + else + { +#ifdef DEBUG_LP + LDEBUG << "BoWBinaryWriter::writeSimpleToken shiftFrom from end: shift by" << (it2-1).value(); +#endif + end += (it2-1).value(); + } + } + + Misc::writeCodedInt(file, beg-1); + Misc::writeCodedInt(file, end-beg); + +#else if (m_shiftFrom.empty()) { #ifdef DEBUG_LP @@ -641,6 +714,7 @@ void BoWBinaryWriterPrivate::writeSimpleToken(std::ostream& file, } } Misc::writeCodedInt(file,token->getLength()); +#endif } void BoWBinaryWriter::writePredicate(std::ostream& file, diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowBinaryReaderWriter.h b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowBinaryReaderWriter.h index 19408baf6..5893d1947 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowBinaryReaderWriter.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowBinaryReaderWriter.h @@ -79,7 +79,8 @@ class LIMA_BOW_EXPORT BoWBinaryReader void readBoWDocumentBlock(std::istream& file, BoWDocument& document, AbstractBoWDocumentHandler& handler, - bool useIterator=false); + bool useIterator, + bool useIndexIterator); boost::shared_ptr< Lima::Common::BagOfWords::AbstractBoWElement > readBoWToken(std::istream& file); void readSimpleToken(std::istream& file, boost::shared_ptr< BoWToken > token); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowComplexToken.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowComplexToken.cpp index 24a3a665a..a577e852d 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowComplexToken.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowComplexToken.cpp @@ -237,8 +237,25 @@ boost::shared_ptr< BoWToken > BoWComplexTokenPrivate::addPart(boost::shared_ptr< if (isHead) { m_head=m_parts.size()-1; } - if (tok->getPosition() < m_position) m_position = tok->getPosition(); 
- if (tok->getPosition() > (m_position + m_length)) m_length = (tok->getPosition()+tok->getLength()-m_position-1); + uint64_t previousPosition = m_position; + + // added the first part + if (m_position == 0 && m_length==0) + { + m_position = tok->getPosition(); + m_length = tok->getLength(); + } + // adding a part before the previous first part + else if (tok->getPosition() < m_position) + { + m_position = tok->getPosition(); + m_length = previousPosition - tok->getPosition() + m_length; + } + // adding a part after the current end + else if (tok->getPosition() > (previousPosition + m_length)) + { + m_length = tok->getPosition() - previousPosition + tok->getLength(); + } return tok; } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowDocument.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowDocument.cpp index 7eaf085a7..e9211f43f 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowDocument.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowDocument.cpp @@ -93,10 +93,6 @@ BoWDocument::BoWDocument(const BoWDocument& d): //*********************************************************************** BoWDocument::~BoWDocument() { -#ifdef DEBUG_LP - BOWLOGINIT; - LDEBUG << "BoWDocument::~BoWDocument" << this; -#endif clear(); } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowNamedEntity.h b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowNamedEntity.h index 60e90dcc9..44d82ee6a 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowNamedEntity.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowNamedEntity.h @@ -76,7 +76,7 @@ class LIMA_BOW_EXPORT BoWNamedEntity : public BoWComplexToken void setFeature(const std::string& attribute, const LimaString& value); - virtual BoWType getType() const { return BOW_NAMEDENTITY; } + virtual 
BoWType getType() const { return BoWType::BOW_NAMEDENTITY; } /** * get a string of the features, of the kind : @@ -84,7 +84,7 @@ class LIMA_BOW_EXPORT BoWNamedEntity : public BoWComplexToken */ std::string getFeaturesUTF8String(void) const; /** get a string of the BoWToken for output function */ - virtual std::string getOutputUTF8String(const Common::PropertyCode::PropertyManager* macroManager) const; + virtual std::string getOutputUTF8String(const Common::PropertyCode::PropertyManager* macroManager = 0) const; virtual std::string getIdUTF8String(void) const; /** diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowTerm.h b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowTerm.h index 436dc32f9..9bb94c7f9 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowTerm.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowTerm.h @@ -65,7 +65,7 @@ class LIMA_BOW_EXPORT BoWTerm : public BoWComplexToken BoWTerm& operator=(const BoWTerm&); - virtual BoWType getType() const { return BOW_TERM; } + virtual BoWType getType() const { return BoWType::BOW_TERM; } /** get a string of the BoWToken for output function */ virtual std::string getOutputUTF8String(const Common::PropertyCode::PropertyManager* macroManager = 0) const; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowText.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowText.cpp index e4ca102c9..3486b2049 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowText.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowText.cpp @@ -80,8 +80,6 @@ BoWText& BoWText::operator = (const BoWText& t) BoWText::~BoWText() { - BOWLOGINIT; - LDEBUG << "BoWText::~BoWText()" << this; clear(); } void BoWText::writeBoWText(ostream& stream) diff --git 
a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowToken.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowToken.cpp index 7fbe39a9c..547ad60f9 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowToken.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowToken.cpp @@ -52,6 +52,17 @@ namespace Common namespace BagOfWords { +uint8_t toInt(const BoWType& bt) +{ + if (bt == BoWType::BOW_NOTYPE) return 0; + else if (bt == BoWType::BOW_TOKEN) return 1; + else if (bt == BoWType::BOW_TERM) return 2; + else if (bt == BoWType::BOW_NAMEDENTITY) return 3; + else if (bt == BoWType::BOW_PREDICATE) return 4; + else return std::numeric_limits::max(); +} + + #define DEFAULT_SEPARATOR L'#' #define DEFAULT_COMPOUND_SEPARATOR L'_' @@ -262,8 +273,6 @@ BoWToken* BoWToken::clone() const //*********************************************************************** BoWToken::~BoWToken() { - BOWLOGINIT; - LDEBUG << "BoWToken::~BoWToken " << this; delete m_d; } @@ -287,7 +296,7 @@ void BoWToken::setCategory(LinguisticCode c) {m_d->m_category = c;}; void BoWToken::setPosition(const uint64_t pos){m_d->m_position = pos;}; void BoWToken::setLength(const uint64_t len) {m_d->m_length = len;}; -BoWType BoWToken::getType() const { return BOW_TOKEN; } +BoWType BoWToken::getType() const { return BoWType::BOW_TOKEN; } uint64_t BoWToken::getVertex() const {return m_d->m_vertex;} void BoWToken::setVertex(uint64_t vertex) {m_d->m_vertex = vertex;} @@ -328,10 +337,10 @@ LimaString BoWToken::getString(void) const if (m_d->m_useOnlyLemma) { -//#ifdef DEBUG_LP - LDEBUG << "BoWToken::getString: m_d->m_useOnlyLemma is 'true'"; - LDEBUG << "BoWToken::getString: getLemma()=" << getLemma(); -//#endif +// #ifdef DEBUG_LP +// LDEBUG << "BoWToken::getString: m_d->m_useOnlyLemma is 'true'"; +// LDEBUG << "BoWToken::getString: getLemma()=" << getLemma(); +// #endif return getLemma(); } else @@ -339,13 +348,13 @@ 
LimaString BoWToken::getString(void) const ostringstream cat; cat << m_d->m_category; //#ifdef DEBUG_LP - LDEBUG << "BoWToken::getString: m_d->m_useOnlyLemma is 'false'"; +// LDEBUG << "BoWToken::getString: m_d->m_useOnlyLemma is 'false'"; //#endif /* ostringstream len; len << m_length; return m_lemma + m_separator + LimaString(cat.str()) + m_separator + LimaString(len.str());*/ //#ifdef DEBUG_LP - LDEBUG << "BoWToken::getString: getLemma()=" << getLemma() << ", cat=" << Misc::utf8stdstring2limastring(cat.str() ); +// LDEBUG << "BoWToken::getString: getLemma()=" << getLemma() << ", cat=" << Misc::utf8stdstring2limastring(cat.str() ); //#endif return getLemma() + m_d->m_separator + Misc::utf8stdstring2limastring(cat.str()); } @@ -384,7 +393,7 @@ Common::Misc::PositionLengthList BoWToken::getPositionLengthList() const bool BoWToken::operator==(const BoWToken& t) const { - if ((getType()==BOW_NAMEDENTITY) && (t.getType()==BOW_NAMEDENTITY)) + if ((getType()==BoWType::BOW_NAMEDENTITY) && (t.getType()==BoWType::BOW_NAMEDENTITY)) { const BoWNamedEntity* n1=dynamic_cast(this); const BoWNamedEntity* n2=dynamic_cast(&t); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowToken.h b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowToken.h index 45e92a7e9..574c8d4e6 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowToken.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowToken.h @@ -158,9 +158,17 @@ class LIMA_BOW_EXPORT BoWToken : public AbstractBoWElement */ virtual void addToPosition(const uint64_t offset); +#ifdef ANTINNO_SPECIFIC + friend LIMA_BOW_EXPORT ::std::ostream& ::Lima::Common::Misc::operator << (::std::ostream& os, +#else friend LIMA_BOW_EXPORT std::ostream& operator << (std::ostream& os, +#endif const Common::Misc::PositionLengthList& p); +#ifdef ANTINNO_SPECIFIC + friend LIMA_BOW_EXPORT QDebug& ::Lima::Common::Misc::operator << (QDebug& os, 
+#else friend LIMA_BOW_EXPORT QDebug& operator << (QDebug& os, +#endif const Common::Misc::PositionLengthList& p); static void setUseOnlyLemma(const bool b); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowTokenIterator.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowTokenIterator.cpp index 061c7e08a..83dc3534c 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowTokenIterator.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowTokenIterator.cpp @@ -191,13 +191,13 @@ boost::shared_ptr< AbstractBoWElement > BoWTokenIterator::getElement() { } else { switch ((*m_d->m_iterator)->getType()) { - case BOW_PREDICATE: - case BOW_TOKEN: { + case BoWType::BOW_PREDICATE: + case BoWType::BOW_TOKEN: { return *m_d->m_iterator; break; } - case BOW_TERM: - case BOW_NAMEDENTITY: { + case BoWType::BOW_TERM: + case BoWType::BOW_NAMEDENTITY: { // element itself will be stored in queue as part m_d->storePartsInQueue(boost::dynamic_pointer_cast< BoWToken >(*m_d->m_iterator)); return m_d->m_partQueue.front().getBoWToken(); @@ -282,7 +282,7 @@ bool BoWTokenIteratorPrivate::addPartElementsInQueue(boost::shared_ptr< BoWToken return false; } // addInPartQueue(token,false); - if (token->getType()==BOW_NAMEDENTITY + if (token->getType()==BoWType::BOW_NAMEDENTITY && m_iterateThroughNamedEntitiesParts==DO_NOT_ITERATE_THROUGH_NAMEDENTITIES_PARTS) { PartTokens pt; @@ -292,14 +292,14 @@ bool BoWTokenIteratorPrivate::addPartElementsInQueue(boost::shared_ptr< BoWToken } switch (token->getType()) { - case BOW_TOKEN: { + case BoWType::BOW_TOKEN: { // push simple token in parts partTokens.push_back(PartTokens()); partTokens.back().push_back(token); break; } - case BOW_TERM: - case BOW_NAMEDENTITY: { + case BoWType::BOW_TERM: + case BoWType::BOW_NAMEDENTITY: { boost::shared_ptr< BoWComplexToken > complexToken=boost::dynamic_pointer_cast(token); if (complexToken->size() == 1) { 
diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowXMLWriter.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowXMLWriter.cpp index 2173caea0..a6ab3d1a0 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowXMLWriter.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowXMLWriter.cpp @@ -57,6 +57,11 @@ class BoWXMLWriterPrivate friend class BoWXMLWriter; BoWXMLWriterPrivate(std::ostream& os); +// FWI 08/09/2015 : ajout de la langue en paramètre +#ifdef ANTINNO_SPECIFIC + BoWXMLWriterPrivate(std::ostream& os, Lima::MediaId const& language); +#else +#endif virtual ~BoWXMLWriterPrivate(); @@ -75,8 +80,8 @@ friend class BoWXMLWriter; void writeBoWRelation(const BoWRelation* relation); void writeComplexTokenParts(const BoWComplexToken* token); void writeBoWTokenList(const BoWText* text, - const bool useIterator=false, - const bool useIndexIterator=false); + const bool useIterator, + const bool useIndexIterator); void writeGenericDocumentProperties(const Misc::GenericDocumentProperties* prop); void writePredicateRoles(const BoWPredicate* term); template @@ -107,6 +112,18 @@ m_language(0) { } +// FWI 08/09/2015 : ajout de la langue en paramètre +#ifdef ANTINNO_SPECIFIC +BoWXMLWriterPrivate::BoWXMLWriterPrivate(std::ostream& os, Lima::MediaId const& language): +m_outputStream(os), +m_currentTokId(0), +m_spaces(""), +m_language(language) +{ +} +#else +#endif + BoWXMLWriterPrivate::~BoWXMLWriterPrivate() { } @@ -119,6 +136,15 @@ BoWXMLWriter::BoWXMLWriter(std::ostream& os): { } +// FWI 08/09/2015 : ajout de la langue en paramètre +#ifdef ANTINNO_SPECIFIC +BoWXMLWriter::BoWXMLWriter(std::ostream& os, Lima::MediaId const& language): +m_d(new BoWXMLWriterPrivate(os, language)) +{ +} +#else +#endif + BoWXMLWriter::~BoWXMLWriter() { delete m_d; @@ -159,14 +185,16 @@ void BoWXMLWriter::closeSBoWNode() { m_d->decIndent(); } -void BoWXMLWriter::processSBoWText( 
const BoWText* boWText, bool useIterator) { +void BoWXMLWriter::processSBoWText( const BoWText* boWText, bool useIterator, + bool useIndexIterator) { m_d->m_language = Common::MediaticData::MediaticData::single().getMediaId ( boWText->lang ); - m_d->writeBoWTokenList(boWText,useIterator); + m_d->writeBoWTokenList(boWText,useIterator,useIndexIterator); } void BoWXMLWriter::processProperties( - const Misc::GenericDocumentProperties* properties, bool /*unused useIterators*/) { + const Misc::GenericDocumentProperties* properties, bool /*unused useIterators*/, + bool /*useIndexIterator*/) { m_d->writeGenericDocumentProperties(properties); } @@ -350,31 +378,78 @@ void BoWXMLWriterPrivate::writeIndexElement( m_outputStream << "/>" << endl; return; } + if (element.isSimpleTerm()) { std::string cat = static_cast(Common::MediaticData::MediaticData::single().mediaData(m_language)).getPropertyCodeManager().getPropertyManager("MACRO").getPropertySymbolicValue(static_cast(element.getCategory())); m_outputStream << " lemma=\"" << xmlString(Common::Misc::limastring2utf8stdstring(element.getSimpleTerm())) +#ifdef ANTINNO_SPECIFIC +// FWI 09/09/2015 hack pour garder la compatibilité avec la box + << "\" category=\"" << element.getCategory() + << "\" categoryString=\"" << cat // uniquement pour info +#else << "\" category=\"" << cat +#endif << "\" position=\"" << element.getPosition() << "\" length=\"" << element.getLength() << "\""; if (element.isNamedEntity()) { +#ifdef ANTINNO_SPECIFIC + string const neTypeAsString = Common::Misc::limastring2utf8stdstring(MediaticData::MediaticData::single().getEntityName(element.getNamedEntityType())); + m_outputStream << " neType=\"" << element.getNamedEntityType()/*xmlString(neTypeAsString)*/ << "\""; + m_outputStream << " neTypeString=\"" << xmlString(neTypeAsString) << "\""; + m_outputStream << " type=\"" << static_cast(BoWType::BOW_NAMEDENTITY) << "\""; + m_outputStream << " typeString=\"" << BoWType::BOW_NAMEDENTITY << "\""; +#else 
m_outputStream << " neType=\"" << element.getNamedEntityType() << "\""; - m_outputStream << " type=\"" << BOW_NAMEDENTITY << "\""; + m_outputStream << " type=\"" << BoWType::BOW_NAMEDENTITY << "\""; +#endif } else { - m_outputStream << " type=\"" << BOW_TOKEN << "\""; +#ifdef ANTINNO_SPECIFIC + m_outputStream << " type=\"" << static_cast(BoWType::BOW_TOKEN) << "\""; + m_outputStream << " typeString=\"" << BoWType::BOW_TOKEN << "\""; +#else + m_outputStream << " type=\"" << BoWType::BOW_TOKEN << "\""; +#endif } m_outputStream << "/>" << endl; return; } // compound + +#ifdef ANTINNO_SPECIFIC + std::string const cat = static_cast(Common::MediaticData::MediaticData::single().mediaData(m_language)).getPropertyCodeManager().getPropertyManager("MACRO").getPropertySymbolicValue(static_cast(element.getCategory())); + // FWI 15/09/2015 : ajout pour info de la chaîne même pour les mots composés + // + hack pour garder la compatibilité avec la box + m_outputStream + << " lemma=\"" << xmlString(Common::Misc::limastring2utf8stdstring(element.getSimpleTerm())) << "\"" + << " category=\"" << element.getCategory() << "\"" + << " categoryString=\"" << xmlString(cat) << "\"" /* uniquement pour info */ + << " position=\"" << element.getPosition() << "\"" + << " length=\"" << element.getLength() << "\""; +#endif + if (element.isNamedEntity()) { + +#ifdef ANTINNO_SPECIFIC + string const neTypeAsString = Common::Misc::limastring2utf8stdstring(MediaticData::MediaticData::single().getEntityName(element.getNamedEntityType())); + m_outputStream << " neType=\"" << element.getNamedEntityType() /*xmlString(neTypeAsString)*/ << "\""; + m_outputStream << " neType=\"" << xmlString(neTypeAsString) << "\""; + m_outputStream << " type=\"" << static_cast(BoWType::BOW_NAMEDENTITY) << "\""; + m_outputStream << " typeString=\"" << BoWType::BOW_NAMEDENTITY << "\""; +#else m_outputStream << " neType=\"" << element.getNamedEntityType() << "\""; - m_outputStream << " type=\"" << BOW_NAMEDENTITY << "\""; + 
m_outputStream << " type=\"" << BoWType::BOW_NAMEDENTITY << "\""; +#endif } else { - m_outputStream << " type=\"" << BOW_TERM << "\""; +#ifdef ANTINNO_SPECIFIC + m_outputStream << " type=\"" << static_cast(BoWType::BOW_TERM) << "\""; + m_outputStream << " typeString=\"" << BoWType::BOW_TERM << "\""; +#else + m_outputStream << " type=\"" << BoWType::BOW_TERM << "\""; +#endif } m_outputStream << ">" << endl << m_spaces << " " << endl; @@ -403,20 +478,26 @@ void BoWXMLWriterPrivate::writeBoWToken( m_currentTokId++; const BoWToken* tok = 0; switch(token->getType()) { - case BOW_TOKEN: { + case BoWType::BOW_TOKEN: { tok = static_cast(token); std::string cat = static_cast(Common::MediaticData::MediaticData::single().mediaData(m_language)).getPropertyCodeManager().getPropertyManager("MACRO").getPropertySymbolicValue(static_cast(tok->getCategory())); m_outputStream <getLemma())) +#ifdef ANTINNO_SPECIFIC +// FWI 09/09/2015 hack pour garder la compatibilité avec la box + << "\" category=\"" << tok->getCategory() + << "\" categoryString=\"" << xmlString(cat) // uniquement pour info +#else << "\" category=\"" << cat +#endif <<"\" position=\"" << tok->getPosition() << "\" length=\"" << tok->getLength() << "\"" << "/>" << std::endl; break; } - case BOW_PREDICATE: { + case BoWType::BOW_PREDICATE: { const BoWPredicate* term=static_cast(token); m_outputStream <" << std::endl; break; } - case BOW_TERM: { + case BoWType::BOW_TERM: { const BoWTerm* term=static_cast(token); std::string cat = static_cast(Common::MediaticData::MediaticData::single().mediaData(m_language)).getPropertyCodeManager().getPropertyManager("MACRO").getPropertySymbolicValue(static_cast(term->getCategory())); m_outputStream <getLemma())) +#ifdef ANTINNO_SPECIFIC +// FWI 09/09/2015 hack pour garder la compatibilité avec la box + << "\" category=\"" << term->getCategory() + << "\" categoryString=\"" << xmlString(cat) // uniquement pour info +#else << "\" category=\"" << cat +#endif <<"\" position=\"" << 
term->getPosition() << "\" length=\"" << term->getLength() << "\"" << ">" << std::endl; @@ -447,14 +534,20 @@ void BoWXMLWriterPrivate::writeBoWToken( m_outputStream <" << std::endl; break; } - case BOW_NAMEDENTITY: { + case BoWType::BOW_NAMEDENTITY: { const BoWNamedEntity* ne=static_cast(token); std::string cat = static_cast(Common::MediaticData::MediaticData::single().mediaData(m_language)).getPropertyCodeManager().getPropertyManager("MACRO").getPropertySymbolicValue(static_cast(ne->getCategory())); m_outputStream <getLemma())) +#ifdef ANTINNO_SPECIFIC +// FWI 09/09/2015 hack pour garder la compatibilité avec la box + << "\" category=\"" << ne->getCategory() + << "\" categoryString=\"" << xmlString(cat) // uniquement pour info +#else << "\" category=\"" << cat +#endif <<"\" position=\"" << ne->getPosition() << "\" length=\"" << ne->getLength() << "\" type=\"" diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowXMLWriter.h b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowXMLWriter.h index b56a8ceb0..a6936be25 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowXMLWriter.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/bowXMLWriter.h @@ -53,16 +53,21 @@ class IndexElement; class LIMA_BOW_EXPORT BoWXMLWriter : public AbstractBoWDocumentHandler { public: - BoWXMLWriter(std::ostream& os); - virtual ~BoWXMLWriter(); +// FWI 08/09/2015 : ajout de la langue en paramètre optionnel +#ifdef ANTINNO_SPECIFIC + BoWXMLWriter(std::ostream& os, Lima::MediaId const& language); +#else +#endif + BoWXMLWriter(std::ostream& os); + virtual ~BoWXMLWriter(); void writeBoWText(const BoWText* document, - const bool useIterator=false, - const bool useIndexIterator=false); + const bool useIterator, + const bool useIndexIterator); void writeBoWToken(const BoWToken* token); void writeBoWDocument(const BoWDocument* document, - const bool useIterator=false, - const bool 
useIndexIterator=false); + const bool useIterator, + const bool useIndexIterator); // root tags for valid XML if several documents void writeBoWDocumentsHeader(); @@ -73,8 +78,10 @@ class LIMA_BOW_EXPORT BoWXMLWriter : public AbstractBoWDocumentHandler // Implementation of AbstractBoWXMLWriter functions void openSBoWNode(const Lima::Common::Misc::GenericDocumentProperties* properties, const std::string& elementName); void openSBoWIndexingNode(const Lima::Common::Misc::GenericDocumentProperties* properties, const std::string& elementName); - void processSBoWText(const BoWText* boWText, bool useIterators); - void processProperties(const Misc::GenericDocumentProperties* properties, bool useIterators); + void processSBoWText(const BoWText* boWText, bool useIterators, + bool useIndexIterator); + void processProperties(const Misc::GenericDocumentProperties* properties, bool useIterators, + bool useIndexIterator); void closeSBoWNode(); void writeIndexElement(const IndexElement& element); void setSpaces(const std::string& s); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/indexElement.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/indexElement.cpp index 8cf7f87cc..4249f7b87 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/indexElement.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/indexElement.cpp @@ -82,7 +82,7 @@ class IndexElementPrivate IndexElementPrivate::IndexElementPrivate(): m_id(0), -m_type(BOW_NOTYPE), +m_type(BoWType::BOW_NOTYPE), m_word(), m_category(0), m_position(0), @@ -238,11 +238,11 @@ uint64_t IndexElement::getId() const { return m_d->m_id; } Lima::Common::BagOfWords::BoWType IndexElement::getType() const { return m_d->m_type; } -bool IndexElement::isSimpleTerm() const { return m_d->m_type == BOW_TOKEN; } +bool IndexElement::isSimpleTerm() const { return m_d->m_type == BoWType::BOW_TOKEN || (m_d->m_type == 
BoWType::BOW_NAMEDENTITY && m_d->m_structure.empty()); } -bool IndexElement::isComposedTerm() const { return m_d->m_type == BOW_TERM; } +bool IndexElement::isComposedTerm() const { return m_d->m_type == BoWType::BOW_TERM || (m_d->m_type == BoWType::BOW_NAMEDENTITY && ! m_d->m_structure.empty()); } -bool IndexElement::isPredicate() const { return m_d->m_type == BOW_PREDICATE; } +bool IndexElement::isPredicate() const { return m_d->m_type == BoWType::BOW_PREDICATE; } const LimaString& IndexElement::getSimpleTerm() const { return m_d->m_word; } @@ -252,7 +252,7 @@ uint64_t IndexElement::getPosition() const { return m_d->m_position; } uint64_t IndexElement::getLength() const { return m_d->m_length; } -bool IndexElement::isNamedEntity() const { return m_d->m_type == BOW_NAMEDENTITY; } +bool IndexElement::isNamedEntity() const { return m_d->m_type == BoWType::BOW_NAMEDENTITY; } const Common::MediaticData::EntityType& IndexElement::getNamedEntityType() const { return m_d->m_neType; } @@ -348,9 +348,14 @@ std::ostream& operator<<(std::ostream& os, const IndexElement& elt) os << "[IndexElement" << elt.m_d->m_id << "," << elt.m_d->m_type ; if (elt.isSimpleTerm()) { os << ":" << Common::Misc::limastring2utf8stdstring(elt.m_d->m_word); +#ifdef ANTINNO_SPECIFIC + // affichage systématique + os << "/" << elt.m_d->m_category; +#else if (elt.m_d->m_category != 0) { os << "/" << elt.m_d->m_category; } +#endif os << "/" << elt.m_d->m_position; os << "," << elt.m_d->m_length; } @@ -366,9 +371,12 @@ std::ostream& operator<<(std::ostream& os, const IndexElement& elt) os << "," << elt.m_d->m_structure[i] << " RE(" << elt.m_d->m_relations[i] << ")"; i++; } + os << "]"; } - os << "/"; - ::operator<<(os,elt.m_d->m_poslenlist); + // FWI 20/02/2015 + //os << "/"; + //::operator<<(os,elt.m_d->m_poslenlist); + os << "/" << elt.m_d->m_poslenlist; } if (! 
elt.m_d->m_neType.isNull()) { os << "/NE(" << Lima::Common::MediaticData::MediaticData::single().getEntityName(elt.m_d->m_neType).toUtf8().constData() << ")"; @@ -434,8 +442,10 @@ QTextStream& operator<<(QTextStream& os, const IndexElement& elt) { i++; } } - os << "/"; - ::operator<<(os,elt.m_d->m_poslenlist); + // FWI 20/02/2015 + //os << "/"; + //::operator<<(os,elt.m_d->m_poslenlist); + os << "/" << elt.m_d->m_poslenlist; } if (! elt.m_d->m_neType.isNull()) { os << "/NE(" << Lima::Common::MediaticData::MediaticData::single().getEntityName(elt.m_d->m_neType) << ")"; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/indexElementIterator.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/indexElementIterator.cpp index 91d9cbc0b..f565c5480 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/indexElementIterator.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/indexElementIterator.cpp @@ -26,6 +26,7 @@ #include "indexElement.h" #include "linguisticProcessing/common/BagOfWords/BoWRelation.h" +#include "linguisticProcessing/common/BagOfWords/bowTerm.h" #include "linguisticProcessing/common/BagOfWords/BoWPredicate.h" #include "indexElementIterator.h" #include "common/FsaAccess/AbstractLexiconIdGenerator.h" @@ -55,37 +56,39 @@ class IndexElementIteratorPrivate IndexElementIteratorPrivate(const IndexElementIteratorPrivate& ieip); ~IndexElementIteratorPrivate(); - typedef std::deque IndexElementQueue; - // members - BoWText::const_iterator m_iterator; - BoWText::const_iterator m_iteratorEnd; - IndexElementQueue m_partQueue; - uint64_t m_maxSizeQueue; - uint64_t m_maxCompoundSize; - AbstractLexiconIdGenerator* m_idGenerator; - - // private functions - - // add in queue - // (return false if size of queue becomes greater than max) - bool addInPartQueue(const uint64_t id, - const BoWType type, - const LimaString& word, - const uint64_t cat, - const uint64_t 
position, - const uint64_t length, - const Common::MediaticData::EntityType neType); + /** */ void getPositionLengthList(const std::vector& structure, Misc::PositionLengthList& poslenlist) const; - // add in queue: only used for compound elements + + /** Add @ref newElement in queue, only if queue size is lower than its maximum. + * Only used for compound elements + * @return true if the element has been added and false otherwise (size of queue would become + * greater than max) + */ bool addInPartQueue(const IndexElement& newElement); - void storePartsInQueue(boost::shared_ptr< BoWToken > token, const uint64_t rel); + /** Calls addPartElementsInQueue to recursively add @ref token parts and itself in the queue + */ + void storePartsInQueue(boost::shared_ptr< BoWToken > token); bool addPartElementsInQueue(boost::shared_ptr< BoWToken > token, std::pair, uint64_t> & ids_rels, const uint64_t rel); + + /** + * this function is recursive to build all composed elements that contains + * the head and all or parts of the extensions, for all possible values (ids) + * of head and extensions + * + * @param partIdsRels : the possible ids of each part, plus one relation per part + * @param head : the position of the head in the parts + * @param ids : the id list in which new ids are added for combined element + * @param structure : the current structure + * @param i : the current part looked at + * + * @return + */ bool addCombinedPartsInQueue(const Lima::Common::BagOfWords::BoWType type, const std::vector, uint64_t> >& partIds_Rels, const uint64_t head, @@ -95,6 +98,16 @@ class IndexElementIteratorPrivate std::vector& relations, const uint64_t i); + typedef std::deque IndexElementQueue; + + // members + BoWText::const_iterator m_iterator; + BoWText::const_iterator m_iteratorEnd; + IndexElementQueue m_partQueue; + uint64_t m_maxSizeQueue; + uint64_t m_maxCompoundSize; + AbstractLexiconIdGenerator* m_idGenerator; + QMap m_alreadyFoundElements; }; 
IndexElementIteratorPrivate::IndexElementIteratorPrivate(const BoWText& bowText, @@ -168,64 +181,109 @@ bool IndexElementIterator::isAtEnd() const // get current element ("dereference" iterator) //********************************************************************** // getting parts is done in this function (rather than in ++ function): -// which means that is a ++ is done before calling a getElement on +// which means that if a ++ is done before calling a getElement on // a complex token, no parts will be explored IndexElement IndexElementIterator::getElement() { +#ifdef DEBUG_CD BOWLOGINIT; - + LDEBUG << "IndexElementIterator::getElement empty:" << m_d->m_partQueue.empty(); +#endif + // If queue is empty + // - for simple tokens: a new index element is returned + // - for complex tokens : it is filled and then its front is returned if (m_d->m_partQueue.empty()) { if (m_d->m_iterator==m_d->m_iteratorEnd) { // at end +#ifdef DEBUG_CD + LDEBUG << "IndexElementIterator::getElement at end: return empty element"; +#endif return IndexElement(); // empty element has id 0 } else { - boost::shared_ptr< BoWToken> token; + boost::shared_ptr< BoWToken> token = boost::dynamic_pointer_cast((*m_d->m_iterator)); boost::shared_ptr< BoWPredicate > predicate; + switch ((*m_d->m_iterator)->getType()) { - case BOW_TOKEN: - { - token = boost::dynamic_pointer_cast((*m_d->m_iterator)); - uint64_t id=m_d->m_idGenerator->getId(token->getString()); - return IndexElement(id, - token->getType(), - token->getLemma(), - token->getCategory(), - token->getPosition(), - token->getLength() - ); - } - case BOW_TERM: - case BOW_NAMEDENTITY: - LDEBUG << "IndexElementIterator::getElement BOW_NAMEDENTITY" /*<< * (static_cast((*m_d->m_iterator)) ) << Lima::Common::MediaticData::MediaticData::single().getEntityName(static_cast((*m_d->m_iterator))->getNamedEntityType())*/; - // element itself will be stored in queue as part - m_d->storePartsInQueue(boost::dynamic_pointer_cast(*m_d->m_iterator),0); - 
return m_d->m_partQueue.front(); - // FIXME Change the handling of predicates to take into account their complex structure nature - case BOW_PREDICATE: - { - predicate = boost::dynamic_pointer_cast((*m_d->m_iterator)); - uint64_t id=m_d->m_idGenerator->getId(predicate->getString()); - return IndexElement(id, - predicate->getType(), - predicate->getString(), - 0, - predicate->getPosition(), - predicate->getLength(), - predicate->getPredicateType() - ); - } - case BOW_NOTYPE: - ; + case BoWType::BOW_TOKEN: + { +#ifdef DEBUG_CD + LDEBUG << "IndexElementIterator::getElement simple token:" << token->getIdUTF8String(); +#endif + if (!m_d->m_alreadyFoundElements.contains(QString::fromUtf8(token->getIdUTF8String().c_str()))) + { + m_d->m_alreadyFoundElements.insert(QString::fromUtf8(token->getIdUTF8String().c_str()), + IndexElement(m_d->m_idGenerator->getId(token->getString()), + token->getType(), + token->getLemma(), + token->getCategory(), + token->getPosition(), + token->getLength() + )); + } + return m_d->m_alreadyFoundElements[QString::fromUtf8(token->getIdUTF8String().c_str())]; + } + case BoWType::BOW_TERM: +#ifdef DEBUG_CD + LDEBUG << "IndexElementIterator::getElement term:" << token->getIdUTF8String(); +#endif + m_d->storePartsInQueue(token); + if (m_d->m_partQueue.empty()) { +#ifdef DEBUG_CD + LDEBUG << "IndexElementIterator::getElement term: part queue is empty" ; +#endif + (*this)++; + return getElement(); + } +#ifdef DEBUG_CD + LDEBUG << "IndexElementIterator::getElement term after storePartsInQueue front is:" << m_d->m_partQueue.front(); +#endif + m_d->m_alreadyFoundElements.insert(QString::fromUtf8(token->getIdUTF8String().c_str()),m_d->m_partQueue.front()); + return m_d->m_partQueue.front(); + + case BoWType::BOW_NAMEDENTITY: +#ifdef DEBUG_CD + LDEBUG << "IndexElementIterator::getElement named entity:" << boost::dynamic_pointer_cast(*m_d->m_iterator)->getIdUTF8String() ;//<< 
Lima::Common::MediaticData::MediaticData::single().getEntityName(static_cast((*m_d->m_iterator))->getNamedEntityType()); + // element itself will be stored in queue as part +#endif + m_d->storePartsInQueue(token); +#ifdef DEBUG_CD + LDEBUG << "IndexElementIterator::getElement ne after storePartsInQueue front is:" << m_d->m_partQueue.front(); +#endif + m_d->m_alreadyFoundElements.insert(QString::fromUtf8(token->getIdUTF8String().c_str()),m_d->m_partQueue.front()); + return m_d->m_partQueue.front(); + + // FIXME Change the handling of predicates to take into account their complex structure nature + case BoWType::BOW_PREDICATE: + { + predicate = boost::dynamic_pointer_cast((*m_d->m_iterator)); + uint64_t id=m_d->m_idGenerator->getId(predicate->getString()); + return IndexElement(id, + predicate->getType(), + predicate->getString(), + 0, + predicate->getPosition(), + predicate->getLength(), + predicate->getPredicateType() + ); + } + case BoWType::BOW_NOTYPE: + return IndexElement(); } } } + // Queue was not empty, returning its front else { +#ifdef DEBUG_CD + LDEBUG << "IndexElementIterator::getElement empty:" << m_d->m_partQueue.empty() << "return part queue front" << m_d->m_partQueue.front(); +#endif return m_d->m_partQueue.front(); } + + // Unreachable return IndexElement(); // empty element has id 0 } @@ -234,15 +292,41 @@ IndexElement IndexElementIterator::getElement() //********************************************************************** IndexElementIterator& IndexElementIterator::operator++() { +#ifdef DEBUG_CD + BOWLOGINIT; +#endif + // If queue is empty, try to advance the text iterator to the next BoWToken + // Otherwose, pop the front element and advance the text iterator if the queue is now empty if (m_d->m_partQueue.empty()) { - if (m_d->m_iterator!=m_d->m_iteratorEnd) { +#ifdef DEBUG_CD + LDEBUG << "IndexElementIterator::operator++ part queue is empty"; +#endif + if (m_d->m_iterator!=m_d->m_iteratorEnd) { m_d->m_iterator++; + // Jump already found 
elements +#ifdef DEBUG_CD + LDEBUG << "IndexElementIterator::operator++ Jump if necessary"; +#endif + while (m_d->m_iterator != m_d->m_iteratorEnd && + boost::dynamic_pointer_cast((*m_d->m_iterator)) && + m_d->m_alreadyFoundElements.contains( QString::fromUtf8(boost::dynamic_pointer_cast((*m_d->m_iterator))->getIdUTF8String().c_str()) ) ) { + m_d->m_iterator++; + } } } else { +#ifdef DEBUG_CD + LDEBUG << "IndexElementIterator::operator++ part queue not empty"; +#endif m_d->m_partQueue.pop_front(); if (m_d->m_partQueue.empty()) { // finished for the parts of this token - m_d->m_iterator++; + m_d->m_iterator++; + // Jump already found elements + while (m_d->m_iterator != m_d->m_iteratorEnd && + boost::dynamic_pointer_cast((*m_d->m_iterator)) && + m_d->m_alreadyFoundElements.contains( QString::fromUtf8(boost::dynamic_pointer_cast((*m_d->m_iterator))->getIdUTF8String().c_str()) ) ) { + m_d->m_iterator++; + } } } return *this; @@ -258,48 +342,28 @@ IndexElementIterator IndexElementIterator::operator++(int) { //********************************************************************** // helper functions for iterator //********************************************************************** -bool IndexElementIteratorPrivate::addInPartQueue(const uint64_t id, - const BoWType type, - const LimaString& word, - const uint64_t cat, - const uint64_t position, - const uint64_t length, - const Common::MediaticData::EntityType neType) -{ - if (m_partQueue.size() >= m_maxSizeQueue) { - BOWLOGINIT; - LWARN << "size of queue exceeded"; - return false; - } - - m_partQueue.push_back(IndexElement(id,type,word,cat,position,length,neType)); -// BOWLOGINIT; -// LDEBUG << "add in part queue " << id << ":" -// << word -// << ";size of queue=" << m_partQueue.size() -// ; - return true; -} - void IndexElementIteratorPrivate::getPositionLengthList(const std::vector& structure, PositionLengthList& poslenlist) const { // update position/length list for structure // use previous elements in queue - 
std::vector::const_iterator - it=structure.begin(),it_end=structure.end(); - for (std::deque::const_iterator - elt=m_partQueue.begin(),elt_end=m_partQueue.end(); - elt!=elt_end; elt++) { - if ((*elt).getId()==*it) { - const PositionLengthList& p=(*elt).getPositionLengthList(); - poslenlist.insert(poslenlist.end(),p.begin(),p.end()); - it++; - if (it==it_end) { - break; - } + for (std::vector::const_iterator it = structure.begin(); it != structure.end(); ++it) { + + QMap::const_iterator found = m_alreadyFoundElements.begin(); + while (found != m_alreadyFoundElements.end() && *it != found.value().getId()) { + ++found; + } + + if (found != m_alreadyFoundElements.end()) { + const PositionLengthList& p = found.value().getPositionLengthList(); + poslenlist.insert(poslenlist.end(), p.begin(), p.end()); + } + else { + BOWLOGINIT + LERROR << "getPositionLengthList failure: element id " << *it << " not found"; } } + // sort positions std::sort(poslenlist.begin(),poslenlist.end()); } @@ -307,6 +371,10 @@ void IndexElementIteratorPrivate::getPositionLengthList(const std::vector= m_maxSizeQueue) { BOWLOGINIT; LWARN << "size of queue exceeded"; @@ -330,13 +398,16 @@ bool IndexElementIteratorPrivate::addInPartQueue(const IndexElement& newElement) } -void IndexElementIteratorPrivate::storePartsInQueue(boost::shared_ptr< Lima::Common::BagOfWords::BoWToken > token, const uint64_t rel) +void IndexElementIteratorPrivate::storePartsInQueue(boost::shared_ptr< Lima::Common::BagOfWords::BoWToken > token) { +#ifdef DEBUG_CD + BOWLOGINIT; + LDEBUG << "IndexElementIteratorPrivate::storePartsInQueue" << token->getIdUTF8String(); +#endif pair, uint64_t> tokenIds; - if (!addPartElementsInQueue(token,tokenIds,rel)) { + if (!addPartElementsInQueue(token,tokenIds,0)) { BOWLOGINIT; - LWARN << "Token contain too many subparts (some are ignored): " - << token->getLemma(); + LWARN << "Token contain too many subparts (some are ignored): " << token->getLemma(); } } @@ -344,40 +415,50 @@ bool 
IndexElementIteratorPrivate::addPartElementsInQueue(boost::shared_ptr< BoWT pair, uint64_t>& ids_rel, uint64_t rel) { -// BOWLOGINIT; -// LDEBUG << "addPartElementsInQueue:" << token->getLemma() << ", rel=" << rel; +#ifdef DEBUG_CD + BOWLOGINIT; + LDEBUG << "IndexElementIteratorPrivate::addPartElementsInQueue" << token->getIdUTF8String() << rel; +#endif Common::MediaticData::EntityType neType; - - + bool result = false; switch (token->getType()) { - case BOW_TOKEN: - { - // simple token : get Id and push in parts - uint64_t id=m_idGenerator->getId(token->getString()); - ids_rel=make_pair(vector(1,id),rel); - - LimaString lemma=token->getLemma(); - if (lemma.size()==0) { - lemma=token->getInflectedForm(); + case BoWType::BOW_TOKEN: + { +#ifdef DEBUG_CD + LDEBUG << "IndexElementIteratorPrivate::addPartElementsInQueue simple token:" << token->getIdUTF8String(); +#endif + if (!m_alreadyFoundElements.contains(QString::fromUtf8(token->getIdUTF8String().c_str()))) + { + LimaString lemma=token->getLemma(); + if (lemma.size()==0) { + lemma=token->getInflectedForm(); + } + // simple token : get Id and push in parts + uint64_t id=m_idGenerator->getId(token->getString()); + + m_alreadyFoundElements.insert(QString::fromUtf8(token->getIdUTF8String().c_str()),IndexElement(id, + token->getType(), + lemma, + token->getCategory(), + token->getPosition(), + token->getLength(), + neType)); + result = addInPartQueue(m_alreadyFoundElements[QString::fromUtf8(token->getIdUTF8String().c_str())]); + } else { + result = true; + } + ids_rel=make_pair(vector(1,m_alreadyFoundElements[QString::fromUtf8(token->getIdUTF8String().c_str())].getId()),rel); + return result; } - - return addInPartQueue(id, - token->getType(), - lemma, - token->getCategory(), - token->getPosition(), - token->getLength(), - neType); - } - case BOW_NAMEDENTITY: - neType=boost::dynamic_pointer_cast(token)->getNamedEntityType(); - break; - case BOW_TERM: - case BOW_PREDICATE: - case BOW_NOTYPE: - default:; + case 
BoWType::BOW_NAMEDENTITY: + neType=boost::dynamic_pointer_cast(token)->getNamedEntityType(); + break; + case BoWType::BOW_TERM: + case BoWType::BOW_PREDICATE: + case BoWType::BOW_NOTYPE: + default:; } // is a complex token @@ -390,31 +471,47 @@ bool IndexElementIteratorPrivate::addPartElementsInQueue(boost::shared_ptr< BoWT return false; } - if (complexToken->size() == 1) { + if (complexToken->size() == 1) { +#ifdef DEBUG_CD + LDEBUG << "IndexElementIteratorPrivate::addPartElementsInQueue complex token of size one"; +#endif // only one part, do not get into it // (for instance, named entity with one element) // push simple token in parts - uint64_t id=m_idGenerator->getId(token->getString()); - ids_rel=make_pair(vector(1,id),rel); + if (!m_alreadyFoundElements.contains(QString::fromUtf8(token->getIdUTF8String().c_str()))) + { + uint64_t id=m_idGenerator->getId(token->getString()); + ids_rel=make_pair(vector(1,id),rel); - LimaString lemma=token->getLemma(); - if (lemma.size()==0) { - lemma=token->getInflectedForm(); - } - return addInPartQueue(id, + LimaString lemma=token->getLemma(); + if (lemma.size()==0) { + lemma=token->getInflectedForm(); + } + m_alreadyFoundElements.insert(QString::fromUtf8(token->getIdUTF8String().c_str()), IndexElement(id, token->getType(), lemma, token->getCategory(), token->getPosition(), token->getLength(), - neType); + neType)); + result = addInPartQueue(m_alreadyFoundElements[QString::fromUtf8(token->getIdUTF8String().c_str())]); + } else { + return result = true; + } + return result; } - - ids_rel=make_pair(vector(0),rel); + +#ifdef DEBUG_CD + LDEBUG << "IndexElementIteratorPrivate::addPartElementsInQueue complex token of size" << complexToken->size(); +#endif + ids_rel=make_pair(vector(),rel); uint64_t nbParts=complexToken->getParts().size(); uint64_t head=complexToken->getHead(); vector, uint64_t> > partIdsRels(nbParts); for (uint64_t i=0; i, uint64_t>& thisPartIdsRels=partIdsRels[i]; uint64_t relType; boost::shared_ptr< BoWRelation 
> relation=(complexToken->getParts()[i]).getBoWRelation(); @@ -422,11 +519,15 @@ bool IndexElementIteratorPrivate::addPartElementsInQueue(boost::shared_ptr< BoWT if (!addPartElementsInQueue(complexToken->getParts()[i].getBoWToken(),thisPartIdsRels,relType)) { return false; } + if (i==head) { // add ids of the head ids_rel.first.insert(ids_rel.first.end(),thisPartIdsRels.first.begin(),thisPartIdsRels.first.end()); } } +#ifdef DEBUG_CD + LDEBUG << "IndexElementIteratorPrivate::addPartElementsInQueue parts added; combining them"; +#endif // add ids for combined parts vector structure; //current structure in recursive function vector relations; //current relations in recursive function @@ -436,19 +537,6 @@ bool IndexElementIteratorPrivate::addPartElementsInQueue(boost::shared_ptr< BoWT return true; } -/** - * this function is recursive to build all composed elements that contains - * the head and all or parts of the extensions, for all possible values (ids) - * of head and extensions - * - * @param partIdsRels : the possible ids of each part, plus one relation per part - * @param head : the position of the head in the parts - * @param ids : the id list in which new ids are added for combined element - * @param structure : the current structure - * @param i : the current part looked at - * - * @return - */ bool IndexElementIteratorPrivate::addCombinedPartsInQueue( const Lima::Common::BagOfWords::BoWType type, const std::vector, uint64_t> >& partIdsRels, @@ -457,55 +545,89 @@ bool IndexElementIteratorPrivate::addCombinedPartsInQueue( std::pair, uint64_t>& ids_rel, std::vector& structure, std::vector& relations, - const uint64_t i) + const uint64_t current) { -// BOWLOGINIT; -// if (logger.isDebugEnabled()) { -// ostringstream oss; -// for (vector >::const_iterator it=structure.begin(), -// it_end=structure.end(); it!=it_end; it++) { -// oss << (*it).first << "/" << (*it).second << ";"; -// } -// LDEBUG << "addCombinedPartsInQueue: nb parts=" << partIdsRels.size() -// << 
", head=" << head << ", current=" << i << ",structure=" << oss.str(); -// } - - if (i>=partIdsRels.size()) { +#ifdef DEBUG_CD + BOWLOGINIT; +#endif + QStringList structureKey; +#ifdef ANTINNO_SPECIFIC + // Modif NAN pour que ça compile sous Visual 2010 + for (auto itElement=structure.begin(),it_end=structure.end(); itElement!=it_end; itElement++) { + structureKey << QString::number(*itElement); +#else + for (auto element: structure) { + structureKey << QString::number(element); +#endif + + } +#ifdef DEBUG_CD + LDEBUG << "addCombinedPartsInQueue: nb parts=" << partIdsRels.size() + << ", head=" << head << ", current=" << current << ", structure=" << structureKey.join(";"); +#endif + bool result = false; + if (current>=partIdsRels.size()) { if (structure.size() == 1) { //just the head: is already in queue +#ifdef DEBUG_CD + LDEBUG << "addCombinedPartsInQueue: just the head: is already in queue"; +#endif return true; } // build indexElement before getting the id : allow to have the // true size of compound (trick: use PositionLengthList to have // the size: number of leaves of the structure), and to avoid // compute the id if size is more than maxCompoundSize - IndexElement compoundElement(0,type,structure,relations,neType); - getPositionLengthList(structure,compoundElement.getPositionLengthList()); - if (compoundElement.getPositionLengthList().size() > m_maxCompoundSize) { - // compound larger than allowed, do not add it in parts, but - // return true anyway (false is reserved for queue size - // overflow) - return true; - } - // at end of parts => add current structure - - uint64_t id=m_idGenerator->getId(structure); -// BOWLOGINIT; -// LDEBUG << "IndexElementIterator: get id from generator " << id; - compoundElement.setId(id); - if (!addInPartQueue(compoundElement)) { - return false; + if (!m_alreadyFoundElements.contains(structureKey.join(";"))) + { + IndexElement compoundElement(0,type,structure,relations,neType); + 
getPositionLengthList(structure,compoundElement.getPositionLengthList()); + if (compoundElement.getPositionLengthList().size() > m_maxCompoundSize) { + // compound larger than allowed, do not add it in parts, but + // return true anyway (false is reserved for queue overflow) +#ifdef DEBUG_CD + LDEBUG << "addCombinedPartsInQueue: just the head: max compound size exceeded"; +#endif + return true; + } + // at end of parts => add current structure + + uint64_t id=m_idGenerator->getId(structure); +#ifdef DEBUG_CD + LDEBUG << "IndexElementIterator: got id from generator " << id; +#endif + compoundElement.setId(id); + m_alreadyFoundElements.insert(structureKey.join(";"),compoundElement); + if (!addInPartQueue(m_alreadyFoundElements[structureKey.join(";")])) { +#ifdef DEBUG_CD + LDEBUG << "addCombinedPartsInQueue: queue overflow"; +#endif + return false; + } else { + result = true; + } + } else { + result = true; } - ids_rel.first.push_back(id); - return true; + ids_rel.first.push_back(m_alreadyFoundElements[structureKey.join(";")].getId()); +#ifdef DEBUG_CD + LDEBUG << "addCombinedPartsInQueue: added to ids_rel.first: " << m_alreadyFoundElements[structureKey.join(";")].getId() << "; return" << result; +#endif + return result; } // add possible at end of structure and recursive call - for (auto it=partIdsRels[i].first.begin(),it_end=partIdsRels[i].first.end(); - it!=it_end; it++) { - structure.push_back(*it); - relations.push_back(partIdsRels[i].second); - if (!addCombinedPartsInQueue(type, partIdsRels,head,neType,ids_rel,structure,relations,i+1)) { +#ifdef ANTINNO_SPECIFIC + Q_FOREACH (auto it,partIdsRels[current].first) { +#else + for (auto it:partIdsRels[current].first) { +#endif + structure.push_back(it); + relations.push_back(partIdsRels[current].second); + if (!addCombinedPartsInQueue(type, partIdsRels,head,neType,ids_rel,structure,relations,current+1)) { +#ifdef DEBUG_CD + LDEBUG << "addCombinedPartsInQueue: recursive call returned false"; +#endif return false; } 
structure.pop_back(); @@ -514,8 +636,11 @@ bool IndexElementIteratorPrivate::addCombinedPartsInQueue( // if head, stop here: current iterator is head, hence always added // otherwise, recursive call without current iterator (that is an // extension) - if (i!=head) { - if (!addCombinedPartsInQueue(type, partIdsRels,head,neType,ids_rel,structure,relations,i+1)) { + if (current!=head) { + if (!addCombinedPartsInQueue(type, partIdsRels,head,neType,ids_rel,structure,relations,current+1)) { +#ifdef DEBUG_CD + LDEBUG << "addCombinedPartsInQueue: second recursive call returned false"; +#endif return false; } } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/tests/BagOfWordsTest2.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/tests/BagOfWordsTest2.cpp index c3481c5b6..6ea8c1c95 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/tests/BagOfWordsTest2.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/tests/BagOfWordsTest2.cpp @@ -34,7 +34,7 @@ void BagOfWordsTest2::test_indexElementDefaultConstructor() // IndexElement(); IndexElement el; QVERIFY(el.getId() == 0); - QVERIFY(el.getType() == BOW_NOTYPE); + QVERIFY(el.getType() == BoWType::BOW_NOTYPE); QVERIFY(el.getSimpleTerm() == ""); QVERIFY(el.getCategory() == 0); QVERIFY(el.getPosition() == 0); @@ -60,7 +60,7 @@ void BagOfWordsTest2::test_indexElementConstructor1() // const Common::MediaticData::EntityType neType=Common::MediaticData::EntityType(), // const uint64_t reType=0); uint64_t id = 1; - BoWType type = BOW_TOKEN; + BoWType type = BoWType::BOW_TOKEN; QString word = QString::fromUtf8("word"); uint64_t cat = 0; uint64_t position = 0; @@ -69,7 +69,7 @@ void BagOfWordsTest2::test_indexElementConstructor1() IndexElement el(id,type,word,cat,position,length,neType); QVERIFY(el.getId() == id); - QVERIFY(el.getType() == BOW_TOKEN); + QVERIFY(el.getType() == BoWType::BOW_TOKEN); QVERIFY(el.getSimpleTerm() == 
word); QVERIFY(el.getCategory() == cat); QVERIFY(el.getPosition() == position); @@ -93,14 +93,14 @@ void BagOfWordsTest2::test_indexElementConstructor2() // const Common::MediaticData::EntityType neType=Common::MediaticData::EntityType(), // const uint64_t reType=0); uint64_t id = 2; - BoWType type = BOW_TERM; + BoWType type = BoWType::BOW_TERM; std::vector structure; std::vector relations; EntityType neType = EntityType(); IndexElement el(id,type,structure,relations,neType); QVERIFY(el.getId() == id); - QVERIFY(el.getType() == BOW_TERM); + QVERIFY(el.getType() == BoWType::BOW_TERM); QVERIFY(el.getSimpleTerm().isEmpty()); QVERIFY(el.getCategory() == 0); QVERIFY(el.getPosition() == 0); @@ -119,7 +119,7 @@ void BagOfWordsTest2::test_indexElementCopyConstructor() qDebug() << "BagOfWordsTest2::test_indexElementCopyConstructor"; // IndexElement(const IndexElement& ie); uint64_t id = 1; - BoWType type = BOW_TOKEN; + BoWType type = BoWType::BOW_TOKEN; QString word = QString::fromUtf8("word"); uint64_t cat = 0; uint64_t position = 0; @@ -147,7 +147,7 @@ void BagOfWordsTest2::test_indexElementCopyConstructor() delete el; el = 0; // Test members after deleting original objects QVERIFY(el_copy.getId() == 1); - QVERIFY(el_copy.getType() == BOW_TOKEN); + QVERIFY(el_copy.getType() == BoWType::BOW_TOKEN); QVERIFY(el_copy.getSimpleTerm() == "word"); QVERIFY(el_copy.getCategory() == 0); QVERIFY(el_copy.getPosition() == 0); @@ -166,7 +166,7 @@ void BagOfWordsTest2::test_indexElementOperatorAffect() qDebug() << "BagOfWordsTest2::test_indexElementCopyConstructor"; // IndexElement(const IndexElement& ie); uint64_t id = 1; - BoWType type = BOW_TOKEN; + BoWType type = BoWType::BOW_TOKEN; QString word = QString::fromUtf8("word"); uint64_t cat = 0; uint64_t position = 0; @@ -175,7 +175,7 @@ void BagOfWordsTest2::test_indexElementOperatorAffect() IndexElement* el = new IndexElement(id,type,word,cat,position,length,neType); uint64_t id2 = 2; - BoWType type2 = BOW_TERM; + BoWType type2 = 
BoWType::BOW_TERM; QString word2 = QString::fromUtf8("other"); uint64_t cat2 = 1; uint64_t position2 = 10; @@ -202,7 +202,7 @@ void BagOfWordsTest2::test_indexElementOperatorAffect() delete el; el = 0; // Test members after deleting original objects QVERIFY(el2.getId() == 1); - QVERIFY(el2.getType() == BOW_TOKEN); + QVERIFY(el2.getType() == BoWType::BOW_TOKEN); QVERIFY(el2.getSimpleTerm() == "word"); QVERIFY(el2.getCategory() == 0); QVERIFY(el2.getPosition() == 0); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/tests/BagOfWordsTest2.h b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/tests/BagOfWordsTest2.h index 1845327ec..3eee57311 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/tests/BagOfWordsTest2.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/tests/BagOfWordsTest2.h @@ -6,7 +6,7 @@ class BagOfWordsTest2: public QObject { Q_OBJECT -private slots: +private Q_SLOTS: void initTestCase(); void test_indexElementDefaultConstructor(); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/tests/BagOfWordsTest3.h b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/tests/BagOfWordsTest3.h index 1770c0716..8219e1182 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/tests/BagOfWordsTest3.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/BagOfWords/tests/BagOfWordsTest3.h @@ -6,7 +6,7 @@ class BagOfWordsTest3: public QObject { Q_OBJECT -private slots: +private Q_SLOTS: void initTestCase(); // BoWText with a BoWTerm and a BoWToken diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/PropertyCode/PropertyCodeManager.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/PropertyCode/PropertyCodeManager.cpp index a3c3dcf46..5fc81099c 100644 --- 
a/lima_linguisticprocessing/src/linguisticProcessing/common/PropertyCode/PropertyCodeManager.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/PropertyCode/PropertyCodeManager.cpp @@ -24,6 +24,8 @@ #include "PropertyCodeManager.h" #include "XMLPropertyHandler.h" #include "SymbolicCodeXMLHandler.h" +// FWI 25/02/2015 inclusion pour bénéficier de operator<<(osstream&, QString) +#include "common/Data/LimaString.h" #include @@ -55,16 +57,9 @@ PropertyCodeManager::PropertyCodeManager() void PropertyCodeManager::readFromXmlFile(const std::string& filename) { PROPERTYCODELOGINIT; - - // check that file exists - { - ifstream fin(filename.c_str(), std::ifstream::binary); - if (!fin.good()) { - LERROR << "invalid XMLPropertyCode file " << filename; - throw InvalidConfiguration(); - } - fin.close(); - } +#ifdef DEBUG_LP + LDEBUG << typeid(*this).name() << "PropertyCodeManager::readFromXmlFile" << filename; +#endif #ifdef DEBUG_LP LDEBUG << typeid(*this).name() << "PropertyCodeManager::readFromXmlFile before creating parser"; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/linearTextRepresentation/ltrText.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/linearTextRepresentation/ltrText.cpp index 487e63f00..47214a8f8 100755 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/linearTextRepresentation/ltrText.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/linearTextRepresentation/ltrText.cpp @@ -112,6 +112,9 @@ void LTR_Text::binaryWriteOn(std::ostream& os) const { uint64_t tokenCounter = 0; writeCodedInt(os, this->size()); + if (this->size()==0) { + return; + } SENTENCE_BOUNDS_T:: const_iterator itSb = m_sentenceBounds.begin(); writeCodedInt(os, *itSb); for (LTR_Text::const_iterator itTok = this->begin(); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/linearTextRepresentation/ltrText.h 
b/lima_linguisticprocessing/src/linguisticProcessing/common/linearTextRepresentation/ltrText.h index 7ac9b8813..73b5122e1 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/linearTextRepresentation/ltrText.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/linearTextRepresentation/ltrText.h @@ -87,6 +87,14 @@ class LIMA_LINEARTEXTREPRESENTATION_EXPORT LTR_Text : public std::vector::clear(); + m_sentenceBounds.clear(); + m_namedEntities.clear(); + } + /** @name accessing */ //@{ SENTS_CONST_ITER_T beginSentenceBounds() const { diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/linguisticData/languageData.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/linguisticData/languageData.cpp index 7bd513381..4eb14e1b3 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/linguisticData/languageData.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/linguisticData/languageData.cpp @@ -30,11 +30,13 @@ #include "common/XMLConfigurationFiles/xmlConfigurationFileParser.h" #include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" #include "common/AbstractFactoryPattern/SimpleFactory.h" +#include "common/tools/FileUtils.h" #include "linguisticProcessing/common/PropertyCode/PropertyManager.h" #include "linguisticProcessing/common/PropertyCode/PropertyCodeManager.h" #include #include +#include using namespace std; using namespace Lima::Common::XMLConfigurationFiles; @@ -197,18 +199,39 @@ void LanguageData::initialize( } void LanguageDataPrivate::initPropertyCode( - const std::string& resourcesPath, + const std::string& resourcesPathsStd, XMLConfigurationFileParser& conf) { LDATALOGINIT; - LINFO << "LanguageDataPrivate::initPropertyCode initializes the property coding system"; + LINFO << "LanguageDataPrivate::initPropertyCode initializes the property coding system with resources path" << resourcesPathsStd; try { - std::string propertyFile=resourcesPath + "/" + 
conf.getModuleGroupParamValue("LinguisticData","Categories","PropertyCodeFile"); + QStringList resourcesPaths= QString::fromUtf8(resourcesPathsStd.c_str()).split(LIMA_PATH_SEPARATOR); + bool propertyCodeFileFound = false; + QString propertyCodeFile = conf.getModuleGroupParamValue("LinguisticData","Categories","PropertyCodeFile").c_str(); + Q_FOREACH(QString resourcesPath, resourcesPaths) + { + QString propertyFile(resourcesPath + "/" + propertyCodeFile); #ifdef DEBUG_LP - LDEBUG << "LanguageDataPrivate::initPropertyCode propertyFile is:" << propertyFile; + LDEBUG << "LanguageDataPrivate::initPropertyCode trying property file" << propertyFile; #endif - m_propCodeManager.readFromXmlFile(propertyFile); + QFileInfo propertyFileInfo(propertyFile); + if (propertyFileInfo.exists()) + { +#ifdef DEBUG_LP + LDEBUG << "LanguageDataPrivate::initPropertyCode reading property file" << propertyFileInfo.filePath(); +#endif + m_propCodeManager.readFromXmlFile(propertyFileInfo.filePath().toUtf8().constData()); + propertyCodeFileFound = true; + // Read at most one property code file for a language + break; + } + } + if (!propertyCodeFileFound) + { + LERROR << "No property code file"< -*/ -/************************************************************************ - * @file positionLengthList.cpp - * @author Mesnard Olivier - * @date - * @version - * copyright Copyright (C) 2003 by CEA LIST - * - ***********************************************************************/ - - -#include - -#include "positionLengthList.h" - -using namespace std; - -QTextStream& operator << (QTextStream& os, - const Lima::Common::Misc::PositionLengthList& p) -{ - if (! 
p.empty()) - { - Lima::Common::Misc::PositionLengthList::const_iterator pos=p.begin(); - os << "(" << (*pos).first << "," << (*pos).second << ")"; - pos++; - while (pos != p.end()) - { - os << "; (" << (*pos).first << "," << (*pos).second << ")"; - pos++; - } - } - return os; -} - -std::ostream& operator << (std::ostream& os, - const Lima::Common::Misc::PositionLengthList& p) -{ - if (! p.empty()) - { - Lima::Common::Misc::PositionLengthList::const_iterator pos=p.begin(); - os << "(" << (*pos).first << "," << (*pos).second << ")"; - pos++; - while (pos != p.end()) - { - os << "; (" << (*pos).first << "," << (*pos).second << ")"; - pos++; - } - } - return os; -} - -QDebug& operator << (QDebug& os, - const Lima::Common::Misc::PositionLengthList& p) -{ - if (! p.empty()) - { - Lima::Common::Misc::PositionLengthList::const_iterator pos=p.begin(); - os << "(" << (*pos).first << "," << (*pos).second << ")"; - pos++; - while (pos != p.end()) - { - os << "; (" << (*pos).first << "," << (*pos).second << ")"; - pos++; - } - } - return os; -} +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. 
If not, see +*/ +/************************************************************************ + * @file positionLengthList.cpp + * @author Mesnard Olivier + * @date + * @version + * copyright Copyright (C) 2003 by CEA LIST + * + ***********************************************************************/ + + +#include + +#include "positionLengthList.h" + +using namespace std; + +#ifdef ANTINNO_SPECIFIC +// FWI 10/01/2014 : déclarations déplacée dans le namespace Misc +namespace Lima { +namespace Common { +namespace Misc { +#endif + +QTextStream& operator << (QTextStream& os, + const Lima::Common::Misc::PositionLengthList& p) +{ + if (! p.empty()) + { + Lima::Common::Misc::PositionLengthList::const_iterator pos=p.begin(); + os << "(" << (*pos).first << "," << (*pos).second << ")"; + pos++; + while (pos != p.end()) + { + os << "; (" << (*pos).first << "," << (*pos).second << ")"; + pos++; + } + } + return os; +} + +std::ostream& operator << (std::ostream& os, + const Lima::Common::Misc::PositionLengthList& p) +{ + if (! p.empty()) + { + Lima::Common::Misc::PositionLengthList::const_iterator pos=p.begin(); + os << "(" << (*pos).first << "," << (*pos).second << ")"; + pos++; + while (pos != p.end()) + { + os << "; (" << (*pos).first << "," << (*pos).second << ")"; + pos++; + } + } + return os; +} + +QDebug& operator << (QDebug& os, + const Lima::Common::Misc::PositionLengthList& p) +{ + if (! 
p.empty()) + { + Lima::Common::Misc::PositionLengthList::const_iterator pos=p.begin(); + os << "(" << (*pos).first << "," << (*pos).second << ")"; + pos++; + while (pos != p.end()) + { + os << "; (" << (*pos).first << "," << (*pos).second << ")"; + pos++; + } + } + return os; +} + +#ifdef ANTINNO_SPECIFIC +}}} +#endif diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/misc/positionLengthList.h b/lima_linguisticprocessing/src/linguisticProcessing/common/misc/positionLengthList.h index cfe7e9ec5..87afc6ab1 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/misc/positionLengthList.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/misc/positionLengthList.h @@ -46,9 +46,22 @@ namespace Misc { } // namespace Misc } // namespace Common } // namespace Lima +#ifdef ANTINNO_SPECIFIC +namespace Lima { +namespace Common { +namespace Misc { +// FWI 10/01/2014 : déclarations déplacée dans le namespace Misc +LIMA_LPMISC_EXPORT QTextStream& operator << (QTextStream& os, const PositionLengthList& p); +LIMA_LPMISC_EXPORT std::ostream& operator << (std::ostream& os, const PositionLengthList& p); +LIMA_LPMISC_EXPORT QDebug& operator << (QDebug& os, const PositionLengthList& p); +} // namespace Misc +} // namespace Common +} // namespace Lima +#else LIMA_LPMISC_EXPORT QTextStream& operator << (QTextStream& os, const Lima::Common::Misc::PositionLengthList& p); LIMA_LPMISC_EXPORT std::ostream& operator << (std::ostream& os, const Lima::Common::Misc::PositionLengthList& p); LIMA_LPMISC_EXPORT QDebug& operator << (QDebug& os, const Lima::Common::Misc::PositionLengthList& p); +#endif #endif // POSITION_LENGTH_LIST_H diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/tgv/TestCaseProcessor.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/tgv/TestCaseProcessor.cpp index ddc85a033..f797d6778 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/tgv/TestCaseProcessor.cpp +++ 
b/lima_linguisticprocessing/src/linguisticProcessing/common/tgv/TestCaseProcessor.cpp @@ -35,6 +35,8 @@ #include #include +#ifdef ANTINNO_SPECIFIC +#else std::ostream& operator<<(std::ostream& oss, const QStringList& qsl) { oss << "{"; @@ -45,6 +47,7 @@ std::ostream& operator<<(std::ostream& oss, const QStringList& qsl) oss << "}"; return oss; } +#endif namespace Lima { @@ -125,12 +128,13 @@ TestCaseError TestCaseProcessor::evalTestCase( right.removeDuplicates(); QSet sleft; - foreach (QString element, left) + + Q_FOREACH (QString element, left) { sleft.insert(element); } QSet sright; - foreach (QString element, right) + Q_FOREACH (QString element, right) { sright.insert(element); } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDict/DictionaryData.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDict/DictionaryData.cpp index a64d5a3bc..4b1c0626e 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDict/DictionaryData.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDict/DictionaryData.cpp @@ -1,151 +1,168 @@ -/* - Copyright 2002-2013 CEA LIST - - This file is part of LIMA. - - LIMA is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - LIMA is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with LIMA. 
If not, see -*/ -/*************************************************************************** - * Copyright (C) 2004-2012 by CEA LIST * - * * - ***************************************************************************/ -#include "DictionaryData.h" - -#include "common/LimaCommon.h" -#include "linguisticProcessing/LinguisticProcessingCommon.h" - -#include -#include - -#include -#include - - -using namespace std; - -namespace Lima -{ - -namespace LinguisticProcessing -{ - -namespace AnalysisDict -{ - -DictionaryData::DictionaryData() : - m_data(0), - m_entriesAddr(), - m_lingPropertiesAddr() -{} - - -DictionaryData::~DictionaryData() -{ - if (m_data) - { - delete [] m_data; - } -} - -void DictionaryData::loadBinaryFile(const std::string& file) -{ - ANALYSISDICTLOGINIT; - LDEBUG << "DictionaryData::loadBinaryFile" << file; - if( !QFileInfo(file.c_str()).exists()) -// if( !boost::filesystem3::exists(file)) - { - std::string mess = "DictionaryData::loadBinaryFile file "; - mess.append(file).append(" not found!"); - throw( std::logic_error( mess ) ); - } - uint64_t dataSize = QFileInfo(file.c_str()).size(); - LDEBUG << "DictionaryData::loadBinaryFile data size: " << dataSize; - m_data = new unsigned char [dataSize]; - if (m_data == NULL) - { - std::string mess = "DictionaryData::loadBinaryFile memory allocation error"; - throw( std::logic_error( mess ) ); - } - - // load data - FILE *dataFile = fopen(file.c_str(), "rb"); - if (dataFile == NULL) - { - std::ostringstream stro (std::ios::in | std::ios::out); - stro << "DictionaryData::loadBinaryFile error cannot open data file " << file; - throw( Lima::IncompleteResources(stro.str()) ); - } - uint64_t readSize = fread(m_data, 1, dataSize, dataFile); //_dataSize = max - fclose(dataFile); - if (readSize != dataSize) - { - std::string mess = "DictionaryData::loadBinaryFile totalDataReadSize != _dataSize "; - throw( std::logic_error( mess ) ); - } - - // parseEntries - unsigned char* p=m_data; - uint64_t 
nbEntries=readCodedInt(p); - m_entriesAddr.resize(nbEntries); - uint64_t read; - for (vector::iterator entryItr=m_entriesAddr.begin(); - entryItr!=m_entriesAddr.end(); - entryItr++) - { - *entryItr = p; - // go to next entry - read=readCodedInt(p); - if (read == 1) - { - // 1 means delete, next in is length - read=readCodedInt(p); - } - p += read; - } - LDEBUG << "read " << nbEntries << " entries"; - - // parseLingProperties - uint64_t nbLingProp=readCodedInt(p); - m_lingPropertiesAddr.resize(nbLingProp); - for(vector::iterator lingItr=m_lingPropertiesAddr.begin(); - lingItr!=m_lingPropertiesAddr.end(); - lingItr++) - { - *lingItr=p; - read = readCodedInt(p); - p += read; - } - LDEBUG << "read " << nbLingProp << " lingPropsSet"; - Q_ASSERT((uint64_t)(p-m_data) == dataSize); -} - -uint64_t DictionaryData::readCodedInt(unsigned char* &p) -{ - uint64_t val = 0; -// cerr << "start read" << endl; - do - { -// cerr << "val = " << val << " *p = " << (int) *p << endl; - val = (val <<7) + ((*p >> 1) & 0x7F); - } - while (*(p++) & 0x1); -// cerr << "end read val=" << val << endl; - return(val); -} - -} - -} - -} +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. 
If not, see +*/ +/*************************************************************************** + * Copyright (C) 2004-2012 by CEA LIST * + * * + ***************************************************************************/ +#include "DictionaryData.h" + +#include "common/LimaCommon.h" +#include "linguisticProcessing/LinguisticProcessingCommon.h" + +#include +#include + +#include +#include + +#ifdef ANTINNO_SPECIFIC +// FWI 31/10/2013 : ajout #include "antinno.ResourcesIdent.h" +#include "antinno.ResourcesIdent.h" +#endif + +using namespace std; + +namespace Lima +{ + +namespace LinguisticProcessing +{ + +namespace AnalysisDict +{ + +DictionaryData::DictionaryData() : + m_data(0), + m_entriesAddr(), + m_lingPropertiesAddr() +{} + + +DictionaryData::~DictionaryData() +{ + if (m_data) + { + delete [] m_data; + } +} + +void DictionaryData::loadBinaryFile(const std::string& file) +{ + ANALYSISDICTLOGINIT; + LDEBUG << "DictionaryData::loadBinaryFile" << file; + if( !QFileInfo(file.c_str()).exists()) +// if( !boost::filesystem3::exists(file)) + { + std::string mess = "DictionaryData::loadBinaryFile file "; + mess.append(file).append(" not found!"); + throw( std::logic_error( mess ) ); + } + uint64_t dataSize = QFileInfo(file.c_str()).size(); + LDEBUG << "DictionaryData::loadBinaryFile data size: " << dataSize; + m_data = new unsigned char [dataSize]; + if (m_data == NULL) + { + std::string mess = "DictionaryData::loadBinaryFile memory allocation error"; + throw( std::logic_error( mess ) ); + } + + // load data + FILE *dataFile = fopen(file.c_str(), "rb"); + if (dataFile == NULL) + { + std::ostringstream stro (std::ios::in | std::ios::out); + stro << "DictionaryData::loadBinaryFile error cannot open data file " << file; + throw( Lima::IncompleteResources(stro.str()) ); + } + uint64_t readSize = fread(m_data, 1, dataSize, dataFile); //_dataSize = max + fclose(dataFile); + if (readSize != dataSize) + { + std::string mess = "DictionaryData::loadBinaryFile totalDataReadSize != 
_dataSize "; + throw( std::logic_error( mess ) ); + } + + // parseEntries + unsigned char* p=m_data; + +#ifdef ANTINNO_SPECIFIC + // FWI 31/10/2013 : ajout code de lecture de l'entête "Ant" (copie code JYS de S2) + //JYS 01/03/11 Affiche l'identification Antinno si elle est presente, sinon ne fait rien + if (string((char*)p, 3) == "Ant") { + p +=3; + const std::size_t antLen = p[0] + p[1]*0x100 + p[2]*0x10000 + p[3]*0x1000000; + p +=4; + LINFO << "\n" + file + "\n" + ::antinno::ResourcesIdent((char*)p, antLen).toHumanReadableString(); + p += antLen; + } //JYS 01/03/11 + #endif + + uint64_t nbEntries=readCodedInt(p); + m_entriesAddr.resize(nbEntries); + uint64_t read; + for (vector::iterator entryItr=m_entriesAddr.begin(); + entryItr!=m_entriesAddr.end(); + entryItr++) + { + *entryItr = p; + // go to next entry + read=readCodedInt(p); + if (read == 1) + { + // 1 means delete, next in is length + read=readCodedInt(p); + } + p += read; + } + LDEBUG << "read " << nbEntries << " entries"; + + // parseLingProperties + uint64_t nbLingProp=readCodedInt(p); + m_lingPropertiesAddr.resize(nbLingProp); + for(vector::iterator lingItr=m_lingPropertiesAddr.begin(); + lingItr!=m_lingPropertiesAddr.end(); + lingItr++) + { + *lingItr=p; + read = readCodedInt(p); + p += read; + } + LDEBUG << "read " << nbLingProp << " lingPropsSet"; + Q_ASSERT((uint64_t)(p-m_data) == dataSize); +} + +uint64_t DictionaryData::readCodedInt(unsigned char* &p) +{ + uint64_t val = 0; +// cerr << "start read" << endl; + do + { +// cerr << "val = " << val << " *p = " << (int) *p << endl; + val = (val <<7) + ((*p >> 1) & 0x7F); + } + while (*(p++) & 0x1); +// cerr << "end read val=" << val << endl; + return(val); +} + +} + +} + +} diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDict/EnhancedAnalysisDictionary.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDict/EnhancedAnalysisDictionary.cpp index 77e2a77bc..d4c551ee9 100644 --- 
a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDict/EnhancedAnalysisDictionary.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDict/EnhancedAnalysisDictionary.cpp @@ -21,6 +21,7 @@ #include "AbstractAccessResource.h" #include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" #include "common/AbstractFactoryPattern/SimpleFactory.h" +#include "common/tools/FileUtils.h" #include "common/MediaticData/mediaticData.h" #include "linguisticProcessing/core/LinguisticResources/LinguisticResources.h" @@ -91,6 +92,7 @@ EnhancedAnalysisDictionaryPrivate::EnhancedAnalysisDictionaryPrivate( EnhancedAnalysisDictionaryPrivate::~EnhancedAnalysisDictionaryPrivate() { + delete m_dicoData; } @@ -166,10 +168,11 @@ void EnhancedAnalysisDictionary::init( } try { - std::string binaryFilePath = Common::MediaticData::MediaticData::single().getResourcesPath() + "/" + unitConfiguration.getParamsValueAtKey("dictionaryValuesFile"); - resourceFileWatcher().addPath(QString::fromUtf8(binaryFilePath.c_str())); + QString binaryFilePath = Misc::findFileInPaths(Common::MediaticData::MediaticData::single().getResourcesPath().c_str(), + unitConfiguration.getParamsValueAtKey("dictionaryValuesFile").c_str()); + resourceFileWatcher().addPath(binaryFilePath); QWriteLocker locker(&m_d->m_lock); - m_d->m_dicoData->loadBinaryFile(binaryFilePath); + m_d->m_dicoData->loadBinaryFile(binaryFilePath.toUtf8().constData()); } catch (NoSuchList& ) { diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDict/FsaAccessResource.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDict/FsaAccessResource.cpp index 03a30ae85..46a0aef0f 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDict/FsaAccessResource.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDict/FsaAccessResource.cpp @@ -1,130 +1,188 @@ -/* - Copyright 2002-2013 CEA LIST - - This file is part of LIMA. 
- - LIMA is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - LIMA is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with LIMA. If not, see -*/ -/*************************************************************************** - * Copyright (C) 2004-2012 by CEA LIST * - * * - ***************************************************************************/ -#include "FsaAccessResource.h" -#include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" -#include "common/AbstractFactoryPattern/SimpleFactory.h" -#include "common/MediaticData/mediaticData.h" -#include "common/FsaAccess/FsaAccessSpare16.h" - -#include -#include -#include - -using namespace Lima::Common::XMLConfigurationFiles; -using namespace Lima::Common; -using namespace std; - -namespace Lima -{ -namespace Common { - namespace FsaAccess { - extern template class LIMA_FSAACCESS_EXPORT FsaAccessReader16 >,struct boost::property > >,struct boost::no_property,struct boost::no_property,struct boost::listS> >; - } -} -namespace LinguisticProcessing -{ -namespace AnalysisDict -{ - -SimpleFactory fsaAccessResourceFactory(FSAACCESSRESSOURCE_CLASSID); - -FsaAccessResource::FsaAccessResource(QObject* parent) - : AbstractAccessResource(parent),m_fsaAccess(0) -{ - connect(this,SIGNAL(resourceFileChanged(QString)),this,SLOT(accessFileChanged(QString))); -} - - -FsaAccessResource::~FsaAccessResource() -{ - if (m_fsaAccess!=0) - { - delete m_fsaAccess; - } -} - -void FsaAccessResource::init( - Common::XMLConfigurationFiles::GroupConfigurationStructure& 
unitConfiguration, - Manager* manager) -{ - /** @addtogroup ResourceConfiguration - * - <group name="..." class="FsaAccess"> - * -  keyFile : file containing the compiled access keys - */ - - ANALYSISDICTLOGINIT; - try - { - string keyfile=Common::MediaticData::MediaticData::single().getResourcesPath() + "/" + unitConfiguration.getParamsValueAtKey("keyFile"); - FsaAccess::FsaAccessSpare16* fsaAccess=new FsaAccess::FsaAccessSpare16(); - resourceFileWatcher().addPath(QString::fromUtf8(keyfile.c_str())); - QWriteLocker locker(&m_lock); - LINFO << "FsaAccessResource::init read keyFile" << QString::fromUtf8(keyfile.c_str()); - fsaAccess->read(keyfile); - m_fsaAccess=fsaAccess; - } - catch (NoSuchParam& ) - { - LERROR << "no param 'keyFile' in FsaAccessResource group for language " << (int) manager->getInitializationParameters().language; - throw InvalidConfiguration(); - } - catch (AccessByStringNotInitialized& ) - { - LERROR << "keyfile " - << Common::MediaticData::MediaticData::single().getResourcesPath() - << "/" - << unitConfiguration.getParamsValueAtKey("keyFile") - << " no found for language " - << (int) manager->getInitializationParameters().language; - throw InvalidConfiguration(); - } -} - -AbstractAccessByString* FsaAccessResource::getAccessByString() const - { return m_fsaAccess;} - -void FsaAccessResource::accessFileChanged ( const QString & path ) -{ - ANALYSISDICTLOGINIT; - // Check if the file exists as, when a file is replaced, accessFileChanged can be triggered - // two times, when it is first suppressed and when the new version is available. 
One should not - // try to load the missing file - if (QFileInfo(path).exists()) - { - LINFO << "FsaAccessResource::accessFileChanged reload" << path; - FsaAccess::FsaAccessSpare16* fsaAccess=new FsaAccess::FsaAccessSpare16(); - QWriteLocker locker(&m_lock); - fsaAccess->read(path.toUtf8().constData()); - delete m_fsaAccess; - m_fsaAccess=fsaAccess; - Q_EMIT accessFileReloaded(m_fsaAccess); - } - else - { - LINFO << "FsaAccessResource::accessFileChanged deleted, ignoring" << path; - } -} - -} // AnalysisDict -} // LinguisticProcessing -} // Lima +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. 
If not, see +*/ +/*************************************************************************** + * Copyright (C) 2004-2012 by CEA LIST * + * * + ***************************************************************************/ +#include "FsaAccessResource.h" +#include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" +#include "common/AbstractFactoryPattern/SimpleFactory.h" +#include "common/MediaticData/mediaticData.h" +#include "common/FsaAccess/FsaAccessSpare16.h" +#include "common/tools/FileUtils.h" + +#include +#include +#include + +using namespace Lima::Common::XMLConfigurationFiles; +using namespace Lima::Common; +using namespace std; + +namespace Lima +{ +namespace Common { + namespace FsaAccess { + extern template class LIMA_FSAACCESS_EXPORT FsaAccessReader16 >,struct boost::property > >,struct boost::no_property,struct boost::no_property,struct boost::listS> >; + } +} +namespace LinguisticProcessing +{ +namespace AnalysisDict +{ + +SimpleFactory fsaAccessResourceFactory(FSAACCESSRESSOURCE_CLASSID); + +FsaAccessResource::FsaAccessResource(QObject* parent) + : AbstractAccessResource(parent),m_fsaAccess(0) +{ + connect(this,SIGNAL(resourceFileChanged(QString)),this,SLOT(accessFileChanged(QString))); +} + + +FsaAccessResource::~FsaAccessResource() +{ + if (m_fsaAccess!=0) + { + delete m_fsaAccess; + } +} + +void FsaAccessResource::init( + Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, + Manager* manager) +{ + /** @addtogroup ResourceConfiguration + * - <group name="..." 
class="FsaAccess"> + * -  keyFile : file containing the compiled access keys + */ + + ANALYSISDICTLOGINIT; + try + { + QStringList resourcesPaths = QString::fromUtf8(Common::MediaticData::MediaticData::single().getResourcesPath().c_str()).split(LIMA_PATH_SEPARATOR); + Q_FOREACH(QString resPath, resourcesPaths) + { + if (QFileInfo(resPath + "/" + unitConfiguration.getParamsValueAtKey("keyFile").c_str()).exists()) + { + string keyfile= (resPath + "/" + unitConfiguration.getParamsValueAtKey("keyFile").c_str()).toUtf8().constData(); + FsaAccess::FsaAccessSpare16* fsaAccess=new FsaAccess::FsaAccessSpare16(); + resourceFileWatcher().addPath(QString::fromUtf8(keyfile.c_str())); + QWriteLocker locker(&m_lock); +#ifdef ANTINNO_SPECIFIC + // FWI 31/10/2013 : ajout code de lecture de l'entête "Ant" (copie code JYS de S2) + //JYS 09/01/11 Saute l'identification Antinno si elle est presente, sinon ne fait rien + //fsaAccess->read(keyfile); + ifstream fileIn(keyfile.c_str(), ios::in | ios::binary); + if (!fileIn.good()) { + LERROR << "cannot open file " << keyfile; + throw InvalidConfiguration(); + } + char magicNumber[3]; + fileIn.read(magicNumber, 3); + if (string(magicNumber, 3) == "Ant") + { + unsigned char intLe[4]; //UNSIGNED obligatoire + fileIn.read((char*)intLe, 4); + const std::size_t antLen = intLe[0] + intLe[1]*0x100 + intLe[2]*0x10000 + intLe[3]*0x1000000; + const std::size_t pos = fileIn.tellg(); + fileIn.seekg(pos+antLen, ios::beg); //saute l'identification Antinno + } + else + fileIn.seekg(0, ios::beg); //pas un fichier repere par Antinno + fsaAccess->read(fileIn); + //JYS 09/01/11 + LINFO << "FsaAccessResource::init read keyFile" << QString::fromUtf8(keyfile.c_str()); +#else + LINFO << "FsaAccessResource::init read keyFile" << QString::fromUtf8(keyfile.c_str()); + fsaAccess->read(keyfile); +#endif + m_fsaAccess=fsaAccess; + break; + } + } + if (!m_fsaAccess) { + LERROR << "resource file" << unitConfiguration.getParamsValueAtKey("keyFile") << "not found in path" 
+ << Common::MediaticData::MediaticData::single().getResourcesPath(); + } + + + } + catch (NoSuchParam& ) + { +#ifdef ANTINNO_SPECIFIC + ::std::ostringstream oss; + oss << "no param 'keyFile' in FsaAccessResource group for language " << (int) manager->getInitializationParameters().language; + throw InvalidConfiguration(oss.str()); +#else + LERROR << "no param 'keyFile' in FsaAccessResource group for language " << (int) manager->getInitializationParameters().language; + throw InvalidConfiguration(); +#endif + } + catch (AccessByStringNotInitialized& ) + { +#ifdef ANTINNO_SPECIFIC + ::std::ostringstream oss; + oss << "keyfile " + << Common::MediaticData::MediaticData::single().getResourcesPath() + << "/" + << unitConfiguration.getParamsValueAtKey("keyFile") + << " no found for language " + << (int) manager->getInitializationParameters().language; + throw InvalidConfiguration(oss.str()); +#else + LERROR << "keyfile " + << Common::MediaticData::MediaticData::single().getResourcesPath() + << "/" + << unitConfiguration.getParamsValueAtKey("keyFile") + << " no found for language " + << (int) manager->getInitializationParameters().language; + throw InvalidConfiguration(); +#endif + } +} + +AbstractAccessByString* FsaAccessResource::getAccessByString() const + { return m_fsaAccess;} + +void FsaAccessResource::accessFileChanged ( const QString & path ) +{ + ANALYSISDICTLOGINIT; + // Check if the file exists as, when a file is replaced, accessFileChanged can be triggered + // two times, when it is first suppressed and when the new version is available. 
One should not + // try to load the missing file + if (QFileInfo(path).exists()) + { + LINFO << "FsaAccessResource::accessFileChanged reload" << path; + FsaAccess::FsaAccessSpare16* fsaAccess=new FsaAccess::FsaAccessSpare16(); + QWriteLocker locker(&m_lock); + fsaAccess->read(path.toUtf8().constData()); + delete m_fsaAccess; + m_fsaAccess=fsaAccess; + Q_EMIT accessFileReloaded(m_fsaAccess); + } + else + { + LINFO << "FsaAccessResource::accessFileChanged deleted, ignoring" << path; + } +} + +} // AnalysisDict +} // LinguisticProcessing +} // Lima diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDict/FsaRwAccessResource.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDict/FsaRwAccessResource.cpp index 447282c90..a426f6dc9 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDict/FsaRwAccessResource.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDict/FsaRwAccessResource.cpp @@ -23,6 +23,7 @@ #include "FsaRwAccessResource.h" #include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" #include "common/AbstractFactoryPattern/SimpleFactory.h" +#include "common/tools/FileUtils.h" #include "common/MediaticData/mediaticData.h" #include "common/FsaAccess/FsaAccessBuilderRandom16.h" @@ -68,9 +69,35 @@ void FsaRwAccessResource::init( FsaAccess::FsaAccessBuilderRandom16* fsaAccess; try { - string keyfile=Common::MediaticData::MediaticData::single().getResourcesPath() + "/" + unitConfiguration.getParamsValueAtKey("keyFile"); + QString keyfile = Common::Misc::findFileInPaths(Common::MediaticData::MediaticData::single().getResourcesPath().c_str(), unitConfiguration.getParamsValueAtKey("keyFile").c_str()); fsaAccess=new FsaAccess::FsaAccessBuilderRandom16(); - fsaAccess->read(keyfile); + +#ifdef ANTINNO_SPECIFIC + // FWI 31/10/2013 : ajout code de lecture de l'entête "Ant" (copie code JYS de S2) + //JYS 09/01/11 Saute l'identification Antinno si elle est presente, sinon 
ne fait rien + auto* const pFileName = keyfile.toUtf8().constData(); + ifstream fileIn(pFileName, ios::in | ios::binary); + if (!fileIn.good()) { + LERROR << "cannot open file " << pFileName; + throw InvalidConfiguration(); + } + char magicNumber[3]; + fileIn.read(magicNumber, 3); + if (string(magicNumber, 3) == "Ant") { + unsigned char intLe[4]; //UNSIGNED obligatoire + fileIn.read((char*)intLe, 4); + const std::size_t antLen = intLe[0] + intLe[1]*0x100 + intLe[2]*0x10000 + intLe[3]*0x1000000; + std::streamoff pos = fileIn.tellg(); + fileIn.seekg(pos+antLen, ios::beg); //saute l'identification Antinno + } + else fileIn.seekg(0, ios::beg); //pas un fichier repere par Antinno + fsaAccess->read(fileIn); + //JYS 09/01/11 +#else + fsaAccess->read(keyfile.toUtf8().constData()); +#endif + + m_fsaAccess=fsaAccess; m_fsaRwAccess=fsaAccess; } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDict/MultiLevelAnalysisDictionary.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDict/MultiLevelAnalysisDictionary.cpp index ac76bd05e..e6d279aba 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDict/MultiLevelAnalysisDictionary.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDict/MultiLevelAnalysisDictionary.cpp @@ -27,6 +27,7 @@ #include "AbstractAccessResource.h" #include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" #include "common/AbstractFactoryPattern/SimpleFactory.h" +#include "common/tools/FileUtils.h" #include "common/MediaticData/mediaticData.h" #include "linguisticProcessing/core/LinguisticResources/LinguisticResources.h" @@ -85,9 +86,9 @@ void MultiLevelAnalysisDictionary::init( hasMainKeys=true; m_mainKeySize=ldico.keys->getSize(); } - string dataFile=Common::MediaticData::MediaticData::single().getResourcesPath() + "/" + *dataIt; + QString dataFile = Common::Misc::findFileInPaths( 
Common::MediaticData::MediaticData::single().getResourcesPath().c_str(), (*dataIt).c_str()); ldico.data=new DictionaryData(); - ldico.data->loadBinaryFile(dataFile); + ldico.data->loadBinaryFile(dataFile.toUtf8().constData()); m_dicos.push_back(ldico); keyIt++; dataIt++; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowDumper.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowDumper.cpp index ca7bfa31c..b9670f2d1 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowDumper.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowDumper.cpp @@ -180,13 +180,41 @@ LimaStatusCode BowDumper::process( analysis.setData("SyntacticData",syntacticData); } - // build BoWText from the result of the analysis BoWText bowText; bowText.lang=metadata->getMetaData("Lang"); buildBoWText(annotationData, syntacticData, bowText,analysis,anagraph,posgraph); + + + +#ifdef ANTINNO_SPECIFIC + // on exclus de la liste les entités xml qui précèdent l'offset et on recalle les positions par rapport au début du noeud en cours d'analyse + uint64_t offset = metadata->getStartOffset(); + QMap shiftFrom; + auto const& m = handler->shiftFrom(); + if (!m.isEmpty()) + { + uint64_t diff = 0; + for (auto it=m.constBegin()+1; it!=m.constEnd(); ++it) + { + //::std::cout << it.key() << " " << it.value() << " " << (it-1).value() << " " << offset << ::std::endl; + if (it.key()+(it-1).value() >= offset) + break; + diff = it.value(); + //::std::cout << "diff: " << diff << ::std::endl; + } + for (auto it=m.constBegin(); it!=m.constEnd(); ++it) + if (it.value() > diff) + { + shiftFrom.insert(it.key()+diff, it.value()-diff); // empirique mais ça a l'air de marcher + //::std::cout << "it.key()+diff: " << it.key()+diff << "it.value()-diff: " << it.value()-diff << ::std::endl; + } + } + BoWBinaryWriter writer(shiftFrom); +#else BoWBinaryWriter writer(handler->shiftFrom()); +#endif 
DumperStream* dstream=initialize(analysis); #ifdef DEBUG_LP diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowGeneration.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowGeneration.cpp index 1e37716f8..2f5742706 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowGeneration.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowGeneration.cpp @@ -233,6 +233,7 @@ std::vector< std::pair< boost::shared_ptr< BoWRelation >, boost::shared_ptr< BoW const AnnotationData* annotationData, std::set< LinguisticGraphVertex >& visited) const { + #ifdef DEBUG_LP DUMPERLOGINIT; LDEBUG << "BowGenerator::buildTermFor annot:" << vx << "; pointing on annot:"<, boost::shared_ptr< Abs LDEBUG << "BowGenerator::createAbstractBoWElement " << v << " has " << anaVertices.size() << " matching vertices in analysis graph"; #endif + bool createdSpecificEntity(false); + // note: anaVertices size should be 0 or 1 - //for (std::set< uint64_t >::const_iterator anaVerticesIt = anaVertices.begin(); portage 32 64 - for (std::set< AnnotationGraphVertex >::const_iterator anaVerticesIt = anaVertices.begin(); - anaVerticesIt != anaVertices.end(); anaVerticesIt++) +#ifdef ANTINNO_SPECIFIC + Q_FOREACH ( AnnotationGraphVertex anaVertex, anaVertices) +#else + for ( AnnotationGraphVertex anaVertex : anaVertices) +#endif { - #ifdef DEBUG_LP - LDEBUG << "BowGenerator::createAbstractBoWElement Looking at analysis graph vertex " << *anaVerticesIt; +#ifdef DEBUG_LP +#ifdef ANTINNO_SPECIFIC + LDEBUG << "BowGenerator::createAbstractBoWElement Looking at analysis graph vertex " << anaVertex << " ----------------------------"; +#else + LDEBUG << "BowGenerator::createAbstractBoWElement Looking at analysis graph vertex " << anaVertex; +#endif +#endif + std::set< AnnotationGraphVertex > matches = annotationData->matches("AnalysisGraph",anaVertex,"annot"); +#ifdef ANTINNO_SPECIFIC + 
Q_FOREACH (AnnotationGraphVertex matchVertex, matches) +#else + for (AnnotationGraphVertex matchVertex: matches) #endif - //std::set< uint64_t > matches = annotationData->matches("AnalysisGraph",*anaVerticesIt,"annot"); portage 32 64 - std::set< AnnotationGraphVertex > matches = annotationData->matches("AnalysisGraph",*anaVerticesIt,"annot"); - //for (std::set< uint64_t >::const_iterator it = matches.begin(); portage 32 64 - for (std::set< AnnotationGraphVertex >::const_iterator it = matches.begin(); - it != matches.end(); it++) { #ifdef DEBUG_LP - LDEBUG << "BowGenerator::createAbstractBoWElement Looking at annotation graph vertex " << *it; + LDEBUG << "BowGenerator::createAbstractBoWElement Looking at annotation graph vertex " << matchVertex; #endif - if (annotationData->hasAnnotation(*it, Common::Misc::utf8stdstring2limastring("SpecificEntity"))) + if (annotationData->hasAnnotation(matchVertex, Common::Misc::utf8stdstring2limastring("SpecificEntity"))) { - boost::shared_ptr< BoWToken > se = createSpecificEntity(v,*it, annotationData, anagraph, posgraph, offsetBegin, false); + boost::shared_ptr< BoWToken > se = createSpecificEntity(v,matchVertex, annotationData, anagraph, posgraph, offsetBegin, false); if (se != 0) { #ifdef DEBUG_LP @@ -616,6 +626,7 @@ std::vector< std::pair< boost::shared_ptr< BoWRelation >, boost::shared_ptr< Abs se->setVertex(v); abstractBowEl.push_back(std::make_pair(boost::shared_ptr< BoWRelation >(),se)); // visited.insert(v); + createdSpecificEntity=true; break; } } @@ -631,16 +642,18 @@ std::vector< std::pair< boost::shared_ptr< BoWRelation >, boost::shared_ptr< Abs #ifdef DEBUG_LP LDEBUG << "BowGenerator::createAbstractBoWElement there are " << matches.size() << " annotation graph vertices matching the current PsGraph vertex " << v; #endif - for (std::set< AnnotationGraphVertex >::const_iterator it = matches.begin(); - it != matches.end(); it++) +#ifdef ANTINNO_SPECIFIC + Q_FOREACH (AnnotationGraphVertex vx, matches) +#else + for 
(AnnotationGraphVertex vx: matches) +#endif { - AnnotationGraphVertex vx=*it; #ifdef DEBUG_LP LDEBUG << "BowGenerator::createAbstractBoWElement Looking at annotation graph vertex " << vx; #endif - if (annotationData->hasAnnotation(*it, Common::Misc::utf8stdstring2limastring("SpecificEntity"))) + if (annotationData->hasAnnotation(vx, Common::Misc::utf8stdstring2limastring("SpecificEntity"))) { - boost::shared_ptr< BoWToken > se = createSpecificEntity(v,*it, annotationData, anagraph, posgraph, offsetBegin); + boost::shared_ptr< BoWToken > se = createSpecificEntity(v,vx, annotationData, anagraph, posgraph, offsetBegin); if (se != 0) { #ifdef DEBUG_LP @@ -652,9 +665,9 @@ std::vector< std::pair< boost::shared_ptr< BoWRelation >, boost::shared_ptr< Abs return abstractBowEl; } } - else if (annotationData->hasIntAnnotation(*it, Common::Misc::utf8stdstring2limastring("CpdTense"))) + else if (annotationData->hasIntAnnotation(vx, Common::Misc::utf8stdstring2limastring("CpdTense"))) { - boost::shared_ptr< BoWToken > ct = createCompoundTense(*it, annotationData, anagraph, posgraph, offsetBegin, visited); + boost::shared_ptr< BoWToken > ct = createCompoundTense(vx, annotationData, anagraph, posgraph, offsetBegin, visited); if (ct != 0) { #ifdef DEBUG_LP @@ -666,20 +679,48 @@ std::vector< std::pair< boost::shared_ptr< BoWRelation >, boost::shared_ptr< Abs return abstractBowEl; } } - else if (annotationData->hasStringAnnotation(*it, Common::Misc::utf8stdstring2limastring("Predicate"))) + else if (annotationData->hasStringAnnotation(vx, Common::Misc::utf8stdstring2limastring("Predicate"))) { #ifdef DEBUG_LP LDEBUG << "BowGenerator::createAbstractBoWElement Found a predicate in the PosGraph annnotation graph matching"; #endif - boost::shared_ptr< BoWPredicate > bP=createPredicate(v, *it, annotationData, anagraph, posgraph, offsetBegin, visited, keepAnyway); - if (bP!=0){ -#ifdef DEBUG_LP - LDEBUG << "BowGenerator::createAbstractBoWElement created a predicate" ; + + 
MorphoSyntacticData* data = get(vertex_data, posgraph, v); + bool toKeep = true; + if (data!=0) + { + #ifdef ANTINNO_SPECIFIC + Q_FOREACH (const auto& elem, *data) +#else + for (const auto& elem: *data) #endif - abstractBowEl.push_back(std::make_pair(boost::shared_ptr< BoWRelation >(),bP)); - // visited.insert(v); - return abstractBowEl; + { + if (!keepAnyway && !shouldBeKept(elem)) + { + toKeep = false; + break; + } + } + } + if (toKeep) + { +#ifdef ANTINNO_SPECIFIC + Q_FOREACH (boost::shared_ptr< BoWPredicate> bP, createPredicate(v, vx, annotationData, anagraph, posgraph, offsetBegin, visited, keepAnyway)) +#else + for (boost::shared_ptr< BoWPredicate >& bP: createPredicate(v, vx, annotationData, anagraph, posgraph, offsetBegin, visited, keepAnyway)) +#endif + { + if (bP!=0) + { + #ifdef DEBUG_LP + LDEBUG << "BowGenerator::createAbstractBoWElement created a predicate" ; + #endif + abstractBowEl.push_back(std::make_pair(boost::shared_ptr< BoWRelation >(),bP)); + // visited.insert(v); + // return abstractBowEl; + } + } } } else @@ -690,12 +731,11 @@ std::vector< std::pair< boost::shared_ptr< BoWRelation >, boost::shared_ptr< Abs } } - // bow tokens have been created for specific entities on the before PoS // tagging graph. 
return them if (!abstractBowEl.empty()) { - return abstractBowEl; +// return abstractBowEl; } const FsaStringsPool& sp=Common::MediaticData::MediaticData::single().stringsPool(m_language); @@ -705,7 +745,14 @@ std::vector< std::pair< boost::shared_ptr< BoWRelation >, boost::shared_ptr< Abs std::set > alreadyCreated; std::pair predNormCode = std::make_pair(StringsPoolIndex(0),LinguisticCode(0)); - + + if (createdSpecificEntity) { + // a specific entity has been created on the analysis graph: do not output a token + // (RB: do that here so that the vertex on the posgraph can also be analyzed: should test is this is + // needed or if we only need to place the return just after the creation of the named entity) + return abstractBowEl; + } + if (data!=0) { for (auto it=data->begin(); it!=data->end(); it++) @@ -1116,115 +1163,137 @@ boost::shared_ptr< BoWNamedEntity > BowGenerator::createSpecificEntity( } -boost::shared_ptr< BoWPredicate > BowGenerator::createPredicate( +QList< boost::shared_ptr< BoWPredicate > > BowGenerator::createPredicate( const LinguisticGraphVertex& lgv, const AnnotationGraphVertex& agv, const AnnotationData* annotationData, const LinguisticGraph& anagraph, const LinguisticGraph& posgraph, const uint64_t offset, std::set< LinguisticGraphVertex >& visited, bool keepAnyway) const { - DUMPERLOGINIT; #ifdef DEBUG_LP + DUMPERLOGINIT; LDEBUG << "BowGenerator::createPredicate ling:" << lgv << "; annot:" << agv; #endif - boost::shared_ptr< BoWPredicate > bowP(new BoWPredicate()); + QList< boost::shared_ptr< BoWPredicate > > result; Token* token = get(vertex_token, posgraph, lgv); - bowP->setPosition(offset+token->position()); - bowP->setLength(token->length()); // FIXME handle the ambiguous case when there is several class values for the predicate QStringList predicateIds=annotationData->stringAnnotation(agv,Common::Misc::utf8stdstring2limastring("Predicate")).split("|"); +#ifdef DEBUG_LP if (predicateIds.size()>1) { - LERROR << 
"BowGenerator::createPredicate Predicate has" << predicateIds.size() << "values:" << predicateIds; + LDEBUG << "BowGenerator::createPredicate Predicate has" << predicateIds.size() << "values:" << predicateIds; } - // FIXME replace the hardcoded VerbNet by a value from configuration - LWARN << "BowGenerator::createPredicate FIXME replace the hardcoded VerbNet by a value from configuration at" << __FILE__ << ", line"<< __LINE__; - LimaString predicate=LimaString("VerbNet.%1").arg(predicateIds.first()); - try +#endif + + + // FIXED replace the hardcoded VerbNet by a value from configuration + // LimaString predicate=predicateIds.first(); + // The fix should work only with FrameNet annotations. VerbNet does not assure to have the same + // number of roles in each list as the number of predicates + for (int i = 0 ; i < predicateIds.size(); i++) { - EntityType predicateEntity= Common::MediaticData::MediaticData::single().getEntityType(predicate); + LimaString predicate = predicateIds[i]; + try + { + EntityType predicateEntity= Common::MediaticData::MediaticData::single().getEntityType(predicate); #ifdef DEBUG_LP - LDEBUG << "BowGenerator::createPredicate The role(s) related to "<< predicate << " is/are "; + LDEBUG << "BowGenerator::createPredicate The role(s) related to "<< predicate << " is/are "; #endif - AnnotationGraph annotGraph=annotationData->getGraph(); - AnnotationGraphOutEdgeIt outIt, outIt_end; - boost::tie(outIt, outIt_end) = boost::out_edges(agv, annotationData->getGraph()); - QMultiMap > roles; - const LimaString typeAnnot="SemanticRole"; - for (; outIt != outIt_end; outIt++) - { - // FIXME handle the ambiguous case when there is several values for each role - const AnnotationGraphVertex semRoleVx=boost::target(*outIt, annotGraph); - QStringList semRoleIds = annotationData->stringAnnotation(agv,semRoleVx,typeAnnot).split("|"); - if (semRoleIds.size()>1) - { - LERROR << "BowGenerator::createPredicate Role has" << semRoleIds.size() << "values:" << semRoleIds; 
- } - // FIXME replace the hardcoded VerbNet by a value from configuration - LimaString semRole = LimaString("VerbNet.%1").arg(semRoleIds.first()); - LDEBUG << semRole; - try + AnnotationGraph annotGraph=annotationData->getGraph(); + AnnotationGraphOutEdgeIt outIt, outIt_end; + boost::tie(outIt, outIt_end) = boost::out_edges(agv, annotationData->getGraph()); + QMultiMap > roles; + const LimaString typeAnnot="SemanticRole"; + for (; outIt != outIt_end; outIt++) { - EntityType semRoleEntity = Common::MediaticData::MediaticData::single().getEntityType(semRole); - std::set< LinguisticGraphVertex > posGraphSemRoleVertices = annotationData->matches("annot", semRoleVx, "PosGraph"); - if (!posGraphSemRoleVertices.empty()) + // FIXME handle the ambiguous case when there is several values for each role + const AnnotationGraphVertex semRoleVx=boost::target(*outIt, annotGraph); + QStringList semRoleIds = annotationData->stringAnnotation(agv,semRoleVx,typeAnnot).split("|"); + if (predicateIds.size() != semRoleIds.size()) + { + DUMPERLOGINIT; + LERROR << "BowGenerator::createPredicate predicateIds and semRoleIds sizes are different:" << predicateIds.size() << "and" << semRoleIds.size(); + LERROR << "BowGenerator::createPredicate abort this predicate creation"; + return result; + } + Q_ASSERT(predicateIds.size() == semRoleIds.size()); + LimaString semRole = semRoleIds[i]; +#ifdef DEBUG_LP + LDEBUG << semRole; +#endif + if (semRole.isEmpty()) continue; + try { - LinguisticGraphVertex posGraphSemRoleVertex = *(posGraphSemRoleVertices.begin()); - if (posGraphSemRoleVertex == lgv) + EntityType semRoleEntity = Common::MediaticData::MediaticData::single().getEntityType(semRole); + std::set< LinguisticGraphVertex > posGraphSemRoleVertices = annotationData->matches("annot", semRoleVx, "PosGraph"); + if (!posGraphSemRoleVertices.empty()) { - LERROR << "BowGenerator::createPredicate role vertex is the same as the trigger vertex. 
Abort this role."; - continue; - } + LinguisticGraphVertex posGraphSemRoleVertex = *(posGraphSemRoleVertices.begin()); + if (posGraphSemRoleVertex == lgv) + { + DUMPERLOGINIT; + LERROR << "BowGenerator::createPredicate role vertex is the same as the trigger vertex. Abort this role."; + continue; + } #ifdef DEBUG_LP - LDEBUG << "BowGenerator::createPredicate Calling createAbstractBoWElement on PoS graph vertex" << posGraphSemRoleVertex; + LDEBUG << "BowGenerator::createPredicate Calling createAbstractBoWElement on PoS graph vertex" << posGraphSemRoleVertex; #endif - std::vector, boost::shared_ptr< AbstractBoWElement > > > semRoleTokens = createAbstractBoWElement(posGraphSemRoleVertex, anagraph,posgraph, offset, annotationData, visited, keepAnyway); + std::vector, boost::shared_ptr< AbstractBoWElement > > > semRoleTokens = createAbstractBoWElement(posGraphSemRoleVertex, anagraph,posgraph, offset, annotationData, visited, keepAnyway); #ifdef DEBUG_LP - LDEBUG << "BowGenerator::createPredicate Created "<< semRoleTokens.size()<<"token for the role associated to " << predicate; + LDEBUG << "BowGenerator::createPredicate Created "<< semRoleTokens.size()<<"token for the role associated to " << predicate; #endif -// if (semRoleTokens[0].second!="") - if (!semRoleTokens.empty()) + // if (semRoleTokens[0].second!="") + if (!semRoleTokens.empty()) + { + roles.insert(semRoleEntity, semRoleTokens[0].second); + } + } + else { - roles.insert(semRoleEntity, semRoleTokens[0].second); +#ifdef DEBUG_LP + LDEBUG << "BowGenerator::createPredicate Found no matching for the semRole in the annot graph"; +#endif } } - else + catch (const Lima::LimaException& e) { -#ifdef DEBUG_LP - LDEBUG << "BowGenerator::createPredicate Found no matching for the semRole in the annot graph"; -#endif + DUMPERLOGINIT; + LERROR << "BowGenerator::createPredicate Unknown semantic role" << semRole << ";" << e.what(); } } - catch (const Lima::LimaException& e) - { - LERROR << "BowGenerator::createPredicate 
Unknown semantic role" << semRole << ";" << e.what(); - } - } - bowP->setPredicateType(predicateEntity); - Common::MediaticData::EntityType pEntityType=bowP->getPredicateType(); - LDEBUG << "BowGenerator::createPredicate Created a Predicate for the verbal class " << Common::MediaticData::MediaticData::single().getEntityName(pEntityType); - if (!roles.empty()) - { - bowP->setRoles(roles); - QMultiMap >pRoles=bowP->roles(); - for (auto it = pRoles.begin(); - it != pRoles.end(); it++) + boost::shared_ptr< BoWPredicate > bowP(new BoWPredicate()); + bowP->setPosition(offset+token->position()); + bowP->setLength(token->length()); + bowP->setPredicateType(predicateEntity); + Common::MediaticData::EntityType pEntityType=bowP->getPredicateType(); +#ifdef DEBUG_LP + LDEBUG << "BowGenerator::createPredicate Created a Predicate for the verbal class " << Common::MediaticData::MediaticData::single().getEntityName(pEntityType); +#endif + if (!roles.empty()) { - boost::shared_ptr< BoWToken> outputRoles=boost::dynamic_pointer_cast(it.value()); - if (outputRoles != 0) + bowP->setRoles(roles); + QMultiMap >pRoles=bowP->roles(); + for (auto it = pRoles.begin(); + it != pRoles.end(); it++) { - LimaString roleLabel=Common::MediaticData::MediaticData::single().getEntityName(it.key()); + boost::shared_ptr< BoWToken> outputRoles=boost::dynamic_pointer_cast(it.value()); + if (outputRoles != 0) + { + LimaString roleLabel=Common::MediaticData::MediaticData::single().getEntityName(it.key()); #ifdef DEBUG_LP - LDEBUG << "BowGenerator::createPredicate Associated "<< QString::fromUtf8(outputRoles->getOutputUTF8String().c_str()) << " to it" << "via the semantic role label "<< roleLabel ; + LDEBUG << "BowGenerator::createPredicate Associated "<< QString::fromUtf8(outputRoles->getOutputUTF8String().c_str()) << " to it" << "via the semantic role label "<< roleLabel ; #endif + } } } + result.append(bowP); + } + catch (const Lima::LimaException& e) + { + DUMPERLOGINIT; + LERROR << 
"BowGenerator::createPredicate Unknown predicate" << predicate << ";" << e.what(); + return QList< boost::shared_ptr< BoWPredicate > >(); } - return bowP; - } - catch (const Lima::LimaException& e) - { - LERROR << "BowGenerator::createPredicate Unknown predicate" << predicate << ";" << e.what(); - return boost::shared_ptr< BoWPredicate >(); } + return result; } boost::shared_ptr< BoWPredicate > BowGenerator::createPredicate( @@ -1380,6 +1449,13 @@ std::vector BowGenerator::createNEParts( #ifdef DEBUG_LP DUMPERLOGINIT; #endif + +#ifdef ANTINNO_SPECIFIC +#ifdef DEBUG_LP + LDEBUG << "BowGenerator: createNEParts(...)"; +#endif +#endif + const LinguisticGraph& graph = (frompos?posgraph:anagraph); const FsaStringsPool& sp=Common::MediaticData::MediaticData::single().stringsPool(m_language); @@ -1471,6 +1547,12 @@ std::vector BowGenerator::createNEParts( const Token* token = get(vertex_token, graph, *m); const MorphoSyntacticData* data = get(vertex_data, graph, *m); +#ifdef ANTINNO_SPECIFIC +#ifdef DEBUG_LP + LDEBUG << "BowGenerator: createNEParts(...) 
token->form(): " << token->form(); +#endif +#endif + if (data!=0 && !data->empty()) { const LinguisticElement& elem=*(data->begin()); @@ -1498,6 +1580,13 @@ std::vector BowGenerator::createNEParts( category, token->position(), token->length())); +#ifdef ANTINNO_SPECIFIC +#ifdef DEBUG_LP + LDEBUG << "BowGenerator: token->stringForm(): " << token->stringForm(); + LDEBUG << "BowGenerator: sp[/*elem.normalizedForm*/ " << elem.normalizedForm << "]: \"" << sp[elem.normalizedForm] << "\""; +#endif +#endif + } } } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowGeneration.h b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowGeneration.h index 0922fafd6..703487b33 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowGeneration.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/BowGeneration.h @@ -123,7 +123,7 @@ class LIMA_ANALYSISDUMPERS_EXPORT BowGenerator bool keepAnyway = false) const; /** - * Builds a BoWPredicate corresoonding to a semantic relation (an edge in the + * Builds a BoWPredicate corresponding to a semantic relation (an edge in the * annotation graph holding a SemanticRelation annotation * * @param lgvs source linguistic graph vertex @@ -213,7 +213,7 @@ class LIMA_ANALYSISDUMPERS_EXPORT BowGenerator // Common::BagOfWords::BoWPredicate* createPredicate(const Common::MediaticData::EntityType& t, QMultiMap roles) const; - boost::shared_ptr< Common::BagOfWords::BoWPredicate > createPredicate(const LinguisticGraphVertex& lgv, const AnnotationGraphVertex& agv, + QList< boost::shared_ptr< Common::BagOfWords::BoWPredicate > > createPredicate(const LinguisticGraphVertex& lgv, const AnnotationGraphVertex& agv, const Common::AnnotationGraphs::AnnotationData* annotationData, const LinguisticGraph& anagraph, const LinguisticGraph& posgraph, diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/CMakeLists.txt 
b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/CMakeLists.txt index f1967236a..10d787e05 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/CMakeLists.txt +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/CMakeLists.txt @@ -31,7 +31,6 @@ SET(lima-lp-analysisdumpers_LIB_SRCS NullDumper.cpp StopList.cpp TextDumper.cpp - ConllDumper.cpp fullXmlDumper.cpp linearTextRepresentationDumper.cpp linearTextRepresentationLogger.cpp diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/ConllDumper.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/ConllDumper.cpp deleted file mode 100644 index d249547e4..000000000 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/ConllDumper.cpp +++ /dev/null @@ -1,541 +0,0 @@ -/* - Copyright 2002-2014 CEA LIST - - This file is part of LIMA. - - LIMA is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - LIMA is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with LIMA. 
If not, see -*/ - -#include "ConllDumper.h" -#include "common/MediaProcessors/DumperStream.h" -#include "common/time/traceUtils.h" -#include "common/Data/strwstrtools.h" -#include "common/MediaticData/mediaticData.h" -#include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" -#include "common/AbstractFactoryPattern/SimpleFactory.h" -#include "linguisticProcessing/LinguisticProcessingCommon.h" -#include "linguisticProcessing/common/annotationGraph/AnnotationGraph.h" -#include "linguisticProcessing/common/annotationGraph/AnnotationData.h" -#include "linguisticProcessing/core/LinguisticProcessors/LinguisticMetaData.h" -#include "linguisticProcessing/core/LinguisticResources/LinguisticResources.h" -#include "linguisticProcessing/core/LinguisticAnalysisStructure/LinguisticGraph.h" -#include "linguisticProcessing/core/LinguisticAnalysisStructure/AnalysisGraph.h" -#include "linguisticProcessing/core/TextSegmentation/SegmentationData.h" -#include "linguisticProcessing/core/SyntacticAnalysis/SyntacticData.h" -#include "linguisticProcessing/core/LinguisticAnalysisStructure/MorphoSyntacticData.h" -#include "linguisticProcessing/core/LinguisticAnalysisStructure/MorphoSyntacticDataUtils.h" -#include "linguisticProcessing/core/Automaton/SpecificEntityAnnotation.h" -#include "common/misc/AbstractAccessByString.h" -#include "linguisticProcessing/core/AnalysisDumpers/EasyXmlDumper/ConstituantAndRelationExtractor.h" -#include "linguisticProcessing/core/AnalysisDumpers/EasyXmlDumper/relation.h" -#include "linguisticProcessing/core/SemanticAnalysis/LimaConllTokenIdMapping.h" - -#include -#include -#include - -#include - -using namespace Lima::Common; -using namespace Lima::Common::MediaticData; -using namespace Lima::Common::XMLConfigurationFiles; -using namespace Lima::Common::AnnotationGraphs; -using namespace Lima::LinguisticProcessing::SpecificEntities; -using namespace Lima::LinguisticProcessing::SemanticAnalysis; -using namespace 
Lima::LinguisticProcessing::SyntacticAnalysis; -using namespace Lima::LinguisticProcessing::LinguisticAnalysisStructure; - -namespace Lima -{ - -namespace LinguisticProcessing -{ - -namespace AnalysisDumpers -{ - -SimpleFactory conllDumperFactory(CONLLDUMPER_CLASSID); - -class ConllDumperPrivate -{ - friend class ConllDumper; - ConllDumperPrivate(); - - virtual ~ConllDumperPrivate(); - - /** - * @brief Collect all annotation tokens corresponding to a predicate of the - * sentence starting at @ref sentenceBegin and finishing at @ref sentenceEnd - */ - QMultiMap collectPredicateTokens( - Lima::AnalysisContent& analysis, LinguisticGraphVertex sentenceBegin, LinguisticGraphVertex sentenceEnd); - - MediaId m_language; - std::string m_property; - const Common::PropertyCode::PropertyAccessor* m_propertyAccessor; - const Common::PropertyCode::PropertyManager* m_propertyManager; - const Common::PropertyCode::PropertyManager* m_timeManager; //Ajout - const Common::PropertyCode::PropertyAccessor* m_timeAccessor; //Ajout - - std::string m_graph; - std::string m_sep; - std::string m_sepPOS; - std::string m_verbTenseFlag; //Ajout - QMap m_conllLimaDepMapping; - std::string m_suffix; -}; - - -ConllDumperPrivate::ConllDumperPrivate(): -m_language(0), -m_property("MICRO"), -m_propertyAccessor(0), -m_propertyManager(0), -m_graph("PosGraph"), -m_sep(" "), -m_sepPOS("#"), -m_conllLimaDepMapping(), -m_suffix(".conll") -{ -} - -ConllDumperPrivate::~ConllDumperPrivate() -{} - -ConllDumper::ConllDumper(): -AbstractTextualAnalysisDumper(), -m_d(new ConllDumperPrivate()) -{ -} - -ConllDumper::~ConllDumper() -{ - delete m_d; -} - -void ConllDumper::init(Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, - Manager* manager) -{ - DUMPERLOGINIT; - AbstractTextualAnalysisDumper::init(unitConfiguration,manager); - m_d->m_language=manager->getInitializationParameters().media; - try - { - m_d->m_graph=unitConfiguration.getParamsValueAtKey("graph"); - } - catch 
(NoSuchParam& ) {} // keep default value - const Common::PropertyCode::PropertyCodeManager& codeManager=static_cast(Common::MediaticData::MediaticData::single().mediaData(m_d->m_language)).getPropertyCodeManager(); - m_d->m_propertyAccessor=&codeManager.getPropertyAccessor("MICRO"); - - try - { - m_d->m_verbTenseFlag=unitConfiguration.getParamsValueAtKey("verbTenseFlag"); - } - catch (NoSuchParam& ) { - m_d->m_verbTenseFlag=std::string("False"); - } // keep default value - - try - { - m_d->m_sep=unitConfiguration.getParamsValueAtKey("sep"); - } - catch (NoSuchParam& ) {} // keep default value - - try - { - m_d->m_sepPOS=unitConfiguration.getParamsValueAtKey("sepPOS"); - } - catch (NoSuchParam& ) {} // keep default value - - try - { - m_d->m_property=unitConfiguration.getParamsValueAtKey("property"); - } - catch (NoSuchParam& ) {} // keep default value - try - { - m_d->m_suffix=unitConfiguration.getParamsValueAtKey("outputSuffix"); - } - catch (NoSuchParam& ) {} // keep default value - m_d->m_propertyManager=&codeManager.getPropertyManager(m_d->m_property); - - m_d->m_timeManager=&codeManager.getPropertyManager("TIME"); - m_d->m_timeAccessor=&codeManager.getPropertyAccessor("TIME"); - - try { - std::string resourcePath = Common::MediaticData::MediaticData::single().getResourcesPath(); - std::string mappingFile = resourcePath + "/" + unitConfiguration.getParamsValueAtKey("mappingFile"); - std::ifstream ifs(mappingFile, std::ifstream::binary); - if (!ifs.good()) - { - LERROR << "ERROR: cannot open"+ mappingFile; - throw InvalidConfiguration(); - } - while (ifs.good() && !ifs.eof()) - { - std::string line; - while(getline(ifs, line)) // as long as we can put the line on "line" - { - QStringList strs = QString::fromUtf8(line.c_str()).split('\t'); - m_d->m_conllLimaDepMapping.insert(strs[0],strs[1]); - } - } - - } catch (Common::XMLConfigurationFiles::NoSuchParam& ) - { - LERROR << "no parameter 'mappingFile' in ConllDumper group" << " !"; - throw InvalidConfiguration(); 
- } -} - -LimaStatusCode ConllDumper::process(AnalysisContent& analysis) const -{ - DUMPERLOGINIT; - LDEBUG << "ConllDumper::process"; - - LinguisticMetaData* metadata=static_cast(analysis.getData("LinguisticMetaData")); - if (metadata == 0) { - LERROR << "ConllDumper::process no LinguisticMetaData ! abort"; - return MISSING_DATA; - } - AnnotationData* annotationData = static_cast(analysis.getData("AnnotationData")); - if (annotationData == 0) { - LERROR << "ConllDumper::process no AnnotationData ! abort"; - return MISSING_DATA; - } - AnalysisGraph* tokenList=static_cast(analysis.getData(m_d->m_graph));//est de type PosGraph et non pas AnalysisGraph - if (tokenList==0) { - LERROR << "ConllDumper::process graph " << m_d->m_graph << " has not been produced: check pipeline"; - return MISSING_DATA; - } - LinguisticGraph* graph=tokenList->getGraph(); - SegmentationData* sd=static_cast(analysis.getData("SentenceBoundaries")); - if (sd==0) { - LERROR << "ConllDumper::process no SentenceBoundaries! 
abort"; - return MISSING_DATA; - } - - SyntacticData* syntacticData=static_cast(analysis.getData("SyntacticData")); - if (syntacticData==0) - { - syntacticData=new SyntacticData(tokenList,0); - syntacticData->setupDependencyGraph(); - analysis.setData("SyntacticData",syntacticData); - } - const DependencyGraph* depGraph = syntacticData-> dependencyGraph(); - - QScopedPointer dstream(initialize(analysis)); - - std::map< LinguisticGraphVertex, std::pair > vertexDependencyInformations; - - std::vector::iterator sbItr=(sd->getSegments().begin()); - uint64_t nbSentences((sd->getSegments()).size()); - LDEBUG << "ConllDumper::process There are "<< nbSentences << " sentences"; - LinguisticGraphVertex sentenceBegin = sbItr->getFirstVertex(); - LinguisticGraphVertex sentenceEnd = sbItr->getLastVertex(); - - - const FsaStringsPool& sp=Common::MediaticData::MediaticData::single().stringsPool(m_d->m_language); -// for (auto im=m_d->m_conllLimaDepMapping.begin();im!=m_d->m_conllLimaDepMapping.end();im++) -// { -// LDEBUG << "("<< (*im).first<< "," << (*im).second << ")" << endl; -// } - - LimaConllTokenIdMapping* limaConllTokenIdMapping = static_cast(analysis.getData("LimaConllTokenIdMapping")); - if (limaConllTokenIdMapping == 0) - { - limaConllTokenIdMapping = new LimaConllTokenIdMapping(); - analysis.setData("LimaConllTokenIdMapping", limaConllTokenIdMapping); - } - int sentenceNb=0; - - while (sbItr != sd->getSegments().end() ) //for each sentence - { - sentenceNb++; - sentenceBegin=sbItr->getFirstVertex(); - sentenceEnd=sbItr->getLastVertex(); - std::mapsegmentationMapping;//mapping the two types of segmentations (Lima and conll) - std::mapsegmentationMappingReverse; - - LDEBUG << "ConllDumper::process begin - end: " << sentenceBegin << " - " << sentenceEnd; - //LinguisticGraphOutEdgeIt outItr,outItrEnd; - QQueue toVisit; - QSet visited; - toVisit.enqueue(sentenceBegin); - int tokenId = 0; - LinguisticGraphVertex v = 0; - while (v != sentenceEnd && !toVisit.empty()) - - { - 
v = toVisit.dequeue(); - LDEBUG << "ConllDumper::process Vertex index : " << v; - visited.insert(v); - segmentationMapping.insert(std::make_pair(v,tokenId)); - segmentationMappingReverse.insert(std::make_pair(tokenId,v)); - LDEBUG << "ConllDumper::process conll id : " << tokenId << " Lima id : " << v; - DependencyGraphVertex dcurrent = syntacticData->depVertexForTokenVertex(v); - DependencyGraphOutEdgeIt dit, dit_end; - boost::tie(dit,dit_end) = boost::out_edges(dcurrent,*depGraph); - for (; dit != dit_end; dit++) - { - LDEBUG << "ConllDumper::process Dumping dependency edge " << (*dit).m_source << " -> " << (*dit).m_target; - try - { - CEdgeDepRelTypePropertyMap typeMap = get(edge_deprel_type, *depGraph); - SyntacticRelationId type = typeMap[*dit]; - std::string syntRelName=static_cast(Common::MediaticData::MediaticData::single().mediaData(m_d->m_language)).getSyntacticRelationName(type); - LDEBUG << "ConllDumper::process relation = " << syntRelName; - LDEBUG << "ConllDumper::process Src : Dep vertex= " << boost::source(*dit, *depGraph); - LinguisticGraphVertex src = syntacticData->tokenVertexForDepVertex(boost::source(*dit, *depGraph)); - LDEBUG << "ConllDumper::process Src : Morph vertex= " << src; - LDEBUG << "ConllDumper::process Targ : Dep vertex= " << boost::target(*dit, *depGraph); - LinguisticGraphVertex dest = syntacticData->tokenVertexForDepVertex(boost::target(*dit, *depGraph)); - LDEBUG << "ConllDumper::process Targ : Morph vertex= " << dest; - if (syntRelName!="") - { - LDEBUG << "ConllDumper::process saving target for" << v << ":" << dest << syntRelName; - vertexDependencyInformations.insert(std::make_pair(v, std::make_pair(dest,syntRelName))); - } - } - catch (const std::range_error& ) - { - } - catch (...) 
- { - LDEBUG << "ConllDumper::process: catch others....."; - throw; - } - } - if (v == sentenceEnd) - { - continue; - } - LinguisticGraphOutEdgeIt outItr,outItrEnd; - for (boost::tie(outItr,outItrEnd)=boost::out_edges(v,*graph); outItr!=outItrEnd; outItr++) - { - LinguisticGraphVertex next=boost::target(*outItr,*graph); - if (!visited.contains(next) && next != tokenList->lastVertex()) - { - toVisit.enqueue(next); - } - } - ++tokenId; - } - - // instead of looking to all vertices, follow the graph (in - // morphological graph, some vertices are not related to main graph: - // idiomatic expressions parts and named entity parts) - - toVisit.clear(); - visited.clear(); - - sentenceBegin=sbItr->getFirstVertex(); - sentenceEnd=sbItr->getLastVertex(); - - // get the list of predicates for the current sentence - QMultiMap predicates = m_d->collectPredicateTokens( analysis, sentenceBegin, sentenceEnd ); - LDEBUG << "ConllDumper::process predicates for sentence between" << sentenceBegin << "and" << sentenceEnd << "are:" << predicates; - - toVisit.enqueue(sentenceBegin); - tokenId=0; - v=0; - while (!toVisit.empty() && v!=sentenceEnd) - { //as long as there are vertices in the sentence - v = toVisit.dequeue(); - - Token* ft=get(vertex_token,*graph,v); - MorphoSyntacticData* morphoData=get(vertex_data,*graph, v); - LDEBUG << "ConllDumper::process PosGraph token" << v; - if( morphoData!=0 && !morphoData->empty() && ft != 0) - { - const QString graphTag=QString::fromUtf8(static_cast(Common::MediaticData::MediaticData::single().mediaData(m_d->m_language)).getPropertyCodeManager().getPropertyManager("MICRO").getPropertySymbolicValue(morphoData->firstValue(*m_d->m_propertyAccessor)).c_str()); - LDEBUG << "ConllDumper::process graphTag:" << graphTag; - - std::string inflectedToken=ft->stringForm().toUtf8().constData(); - std::string lemmatizedToken; - if (morphoData != 0 && !morphoData->empty()) - { - lemmatizedToken=sp[(*morphoData)[0].lemma].toUtf8().constData(); - } - - QString 
conllRelName = "-"; - int targetConllId = 0; - if (vertexDependencyInformations.count(v)!=0) - { - LinguisticGraphVertex target=vertexDependencyInformations.find(v)->second.first; - LDEBUG << "ConllDumper::process target saved for" << v << "is" << target; - targetConllId=segmentationMapping.find(target)->second; - LDEBUG << "ConllDumper::process conll target saved for " << tokenId << " is " << targetConllId; - QString relName = QString::fromUtf8(vertexDependencyInformations.find(v)->second.second.c_str()); - LDEBUG << "ConllDumper::process the lima dependency tag for " - << ft->stringForm()<< " is " << relName; - if (m_d->m_conllLimaDepMapping.contains(relName)) - { - conllRelName=m_d->m_conllLimaDepMapping[relName]; - } - else - { - LERROR << "ConllDumper::process" << relName << "not found in mapping"; - } - } - QString targetConllIdString = targetConllId > 0 ? QString("%1").arg(targetConllId) : "-"; - dstream->out() << tokenId << "\t"<< inflectedToken << "\t" - << lemmatizedToken << "\t" << graphTag << "\t" - << graphTag << "\t" << "-" << "\t" << targetConllIdString << "\t" - << conllRelName.toUtf8().constData() << "\t-\t-"; - if (!predicates.isEmpty()) - { - dstream->out() << "\t"; - LDEBUG << "ConllDumper::process output the predicate if any"; - if (!predicates.contains(v)) - { - // No predicate for this token - dstream->out() << "-"; - } - else - { - // This token is a predicate, output it - QString predicateAnnotation = annotationData->stringAnnotation(predicates.value(v),"Predicate"); - dstream->out() << predicateAnnotation; - } - // Now output the roles supported by the current PoS graph token - - LDEBUG << "ConllDumper::process output the roles for the" << predicates.keys().size() << "predicates"; - for (int i = 0; i < predicates.keys().size(); i++) - { - // There will be one column for each predicate. 
Output the - // separator right now - dstream->out() << "\t"; - AnnotationGraphVertex predicateVertex = predicates.value(predicates.keys()[i]); - - std::set< AnnotationGraphVertex > vMatches = annotationData->matches("PosGraph", v, "annot"); - if (vMatches.empty()) - { - LDEBUG << "ConllDumper::process no node matching PoS graph vertex" << v << "in the annotation graph. Output '-'."; - dstream->out() << "-"; - } - else - { - LDEBUG << "ConllDumper::process there is"<getGraph()); - for (; vMatchInEdgesIt != vMatchInEdgesIt_end; vMatchInEdgesIt++) - { - AnnotationGraphVertex inVertex = boost::source(*vMatchInEdgesIt, annotationData->getGraph()); - std::set< LinguisticGraphVertex > inVertexAnnotPosGraphMatches = annotationData->matches("annot",inVertex,"PosGraph"); - if (inVertex == predicateVertex && !inVertexAnnotPosGraphMatches.empty()) - { - // Current edge is holding a role of the current predicate - roleAnnotation = annotationData->stringAnnotation(*vMatchInEdgesIt,"SemanticRole"); - break; - } - else - { - // Current edge does not hold a role of the current predicate -// dstream->out() << "-"; - } - } - if (roleAnnotation != "-") break; - } - dstream->out() << roleAnnotation.toUtf8().constData(); - } - } - } - dstream->out() << std::endl; - } - - if (v == sentenceEnd) - { - continue; - } - LDEBUG << "ConllDumper::process look at out edges of" << v; - LinguisticGraphOutEdgeIt outIter,outIterEnd; - for (boost::tie(outIter,outIterEnd) = boost::out_edges(v,*graph); outIter!=outIterEnd; outIter++) - { - LinguisticGraphVertex next = boost::target(*outIter,*graph); - LDEBUG << "ConllDumper::process looking out vertex" << next; - if (!visited.contains(next)) - { - LDEBUG << "ConllDumper::process enqueuing" << next; - visited.insert(next); - toVisit.enqueue(next); - } - } - tokenId++; - } - dstream->out() << std::endl; - limaConllTokenIdMapping->insert(std::make_pair(sentenceNb, segmentationMappingReverse)); - sbItr++; - } - - return SUCCESS_ID; - -} - -QMultiMap 
ConllDumperPrivate::collectPredicateTokens( - Lima::AnalysisContent& analysis, LinguisticGraphVertex sentenceBegin, LinguisticGraphVertex sentenceEnd) -{ - DUMPERLOGINIT; - QMap result; - - AnnotationData* annotationData = static_cast(analysis.getData("AnnotationData")); - - AnalysisGraph* tokenList=static_cast(analysis.getData(m_graph)); - if (tokenList==0) { - LERROR << "graph " << m_graph << " has not been produced: check pipeline"; - return result; - } - LinguisticGraph* graph=tokenList->getGraph(); - - - QQueue toVisit; - QSet visited; - toVisit.enqueue(sentenceBegin); - LinguisticGraphVertex v = 0; - while (v!=sentenceEnd && !toVisit.empty()) - { - v = toVisit.dequeue(); - LDEBUG << "ConllDumperPrivate::collectPredicateTokens vertex:" << v; - visited.insert(v); - - std::set< AnnotationGraphVertex > vMatches = annotationData->matches("PosGraph", v, "annot"); - for (auto it = vMatches.begin(); it != vMatches.end(); it++) - { - AnnotationGraphVertex vMatch = *it; - if (annotationData->hasStringAnnotation(vMatch,"Predicate")) - { - LDEBUG << "ConllDumperPrivate::collectPredicateTokens insert" << v << vMatch; - result.insert(v, vMatch); - } - } - LinguisticGraphOutEdgeIt outItr,outItrEnd;bool newSentence(const QString & line); - for (boost::tie(outItr,outItrEnd)=boost::out_edges(v,*graph); outItr!=outItrEnd; outItr++) - { - LinguisticGraphVertex next=boost::target(*outItr,*graph); - if (!visited.contains(next) && next != tokenList->lastVertex()) - { - toVisit.enqueue(next); - } - } - } - return result; -} - -} // end namespace -} // end namespace -} // end namespace diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/EasyXmlDumper/ConstituantAndRelationExtractor.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/EasyXmlDumper/ConstituantAndRelationExtractor.cpp index 35d2a67cb..48d2e4743 100644 --- 
a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/EasyXmlDumper/ConstituantAndRelationExtractor.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/EasyXmlDumper/ConstituantAndRelationExtractor.cpp @@ -489,7 +489,7 @@ void ConstituantAndRelationExtractor::constructionDesGroupes() { relationsToFollow.insert("SujInv"); relationsToFollow.insert("TIl"); - // relationsToFollow.insert("TEMPCOMP"); + // relationsToFollow.insert("aux"); newGrp = createGroupe(forme, relationsToFollow, "NV"); if(newGrp == 0) { @@ -536,7 +536,7 @@ void ConstituantAndRelationExtractor::constructionDesGroupes() else if ( forme->micro == "ADV" && forme->hasOutRelation("AdvSub") ) { relationsToFollow.insert("AdvSub"); - // relationsToFollow.insert("TEMPCOMP"); + // relationsToFollow.insert("aux"); newGrp = createGroupe(forme, relationsToFollow, "GP"); if (newGrp == 0) { @@ -565,7 +565,7 @@ void ConstituantAndRelationExtractor::constructionDesGroupes() else if ( forme->micro == "NC" && forme->hasInRelation("SUBSUBJUX") ) { relationsToFollow.insert("SUBSUBJUX"); - // relationsToFollow.insert("TEMPCOMP"); + // relationsToFollow.insert("aux"); newGrp = createGroupe(forme, relationsToFollow, "GN", true); } else if ( forme->micro == "PROREL" ) @@ -644,7 +644,7 @@ void ConstituantAndRelationExtractor::constructionDesGroupes() { relationsToFollow.insert("PronReflVerbe"); relationsToFollow.insert("AuxCplPrev"); - // relationsToFollow.insert("TEMPCOMP"); + // relationsToFollow.insert("aux"); newGrp = createGroupe(forme, relationsToFollow, "NV"); } else if ( forme->micro == "PROREL" ) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/GenericXmlDumper.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/GenericXmlDumper.cpp index 5dc610043..d358243e4 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/GenericXmlDumper.cpp +++ 
b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/GenericXmlDumper.cpp @@ -370,6 +370,7 @@ xmlOutput(std::ostream& out, { // no sentence bounds : dump all text at once xmlOutputVertices(out, + analysis, anagraph, posgraph, annotationData, @@ -394,12 +395,13 @@ xmlOutput(std::ostream& out, // if (sentenceEnd==posgraph->lastVertex()) { // continue; // } - + LDEBUG << "dump sentence between " << sentenceBegin << " and " << sentenceEnd; LDEBUG << "dump simple terms for this sentence"; ostringstream oss; xmlOutputVertices(oss, + analysis, anagraph, posgraph, annotationData, @@ -424,6 +426,7 @@ xmlOutput(std::ostream& out, void GenericXmlDumper:: xmlOutputVertices(std::ostream& out, + AnalysisContent& analysis, AnalysisGraph* anagraph, AnalysisGraph* posgraph, const Common::AnnotationGraphs::AnnotationData* annotationData, @@ -509,7 +512,7 @@ xmlOutputVertices(std::ostream& out, continue; }*/ ostringstream oss; - xmlOutputVertex(oss,(*d),anagraph,posgraph,annotationData,syntacticData, + xmlOutputVertex(oss,analysis,(*d),anagraph,posgraph,annotationData,syntacticData, sp,offset,visited,alreadyStoredVertices); uint64_t pos=(*it).first->position(); xmlOutputs[pos].push_back(oss.str()); @@ -526,6 +529,7 @@ xmlOutputVertices(std::ostream& out, void GenericXmlDumper:: xmlOutputVertex(std::ostream& out, + AnalysisContent& analysis, LinguisticGraphVertex v, AnalysisGraph* anagraph, AnalysisGraph* posgraph, @@ -545,7 +549,7 @@ xmlOutputVertex(std::ostream& out, se=checkSpecificEntity(v,anagraph,posgraph,annotationData); if (se.first!=0) { LDEBUG << "GenericXmlDumper: -- is a specific entity "; - if (xmlOutputSpecificEntity(out,se.first,se.second,sp,offset)) { + if (xmlOutputSpecificEntity(out,analysis,se.first,se.second,sp,offset)) { return; } else { @@ -561,7 +565,7 @@ xmlOutputVertex(std::ostream& out, if (compoundTokens.size()!=0) { for (auto it=compoundTokens.begin(), it_end=compoundTokens.end();it!=it_end;it++) { - 
xmlOutputCompound(out,(*it),anagraph,posgraph,annotationData,sp,offset); + xmlOutputCompound(out,analysis,(*it),anagraph,posgraph,annotationData,sp,offset); std::set bowTokenVertices = (*it)->getVertices(); alreadyStoredVertices.insert(bowTokenVertices.begin(), bowTokenVertices.end()); } @@ -571,7 +575,7 @@ xmlOutputVertex(std::ostream& out, LDEBUG << "GenericXmlDumper: -- is simple word "; // if not a specific entity nor a compound, output simple word infos if (m_outputWords) { - xmlOutputVertexInfos(out, v, posgraph, offset); + xmlOutputVertexInfos(out, analysis, v, posgraph, offset); } } @@ -622,6 +626,7 @@ GenericXmlDumper::checkSpecificEntity(LinguisticGraphVertex v, bool GenericXmlDumper:: xmlOutputSpecificEntity(std::ostream& out, + AnalysisContent& analysis, const SpecificEntities::SpecificEntityAnnotation* se, LinguisticAnalysisStructure::AnalysisGraph* graph, const FsaStringsPool& sp, @@ -641,7 +646,7 @@ xmlOutputSpecificEntity(std::ostream& out, string value=xmlString(specificEntityFeature(se,m_featureNames[i],sp,offset)); if (value.empty()) { // otherwise, get features from head - value=xmlString(m_features[i]->getValue(graph,se->getHead())); + value=xmlString(m_features[i]->getValue(graph,se->getHead(),analysis)); } out << " " << m_featureTags[i] << "=\"" << value << "\""; } @@ -653,7 +658,7 @@ xmlOutputSpecificEntity(std::ostream& out, for (std::vector< LinguisticGraphVertex>::const_iterator m(se->m_vertices.begin()); m != se->m_vertices.end(); m++) { - xmlOutputVertexInfos(out,(*m),graph,offset); + xmlOutputVertexInfos(out,analysis,(*m),graph,offset); } out << "" << endl; } @@ -667,7 +672,7 @@ xmlOutputSpecificEntity(std::ostream& out, for (std::vector< LinguisticGraphVertex>::const_iterator m(se->m_vertices.begin()); m != se->m_vertices.end(); m++) { - xmlOutputVertexInfos(out,(*m),graph,offset); + xmlOutputVertexInfos(out,analysis,(*m),graph,offset); } } @@ -732,6 +737,7 @@ checkCompound(LinguisticGraphVertex v, void GenericXmlDumper:: 
xmlOutputCompound(std::ostream& out, + AnalysisContent& analysis, boost::shared_ptr token, LinguisticAnalysisStructure::AnalysisGraph* anagraph, LinguisticAnalysisStructure::AnalysisGraph* posgraph, @@ -742,12 +748,12 @@ xmlOutputCompound(std::ostream& out, DUMPERLOGINIT; LDEBUG << "GenericXmlDumper: output BoWToken [" << token->getOutputUTF8String() << "]"; switch (token->getType()) { - case BOW_PREDICATE:{ + case BoWType::BOW_PREDICATE:{ // FIXME To implement - LERROR << "GenericXmlDumper: BOW_PREDICATE support not implemented"; + LERROR << "GenericXmlDumper: BoWType::BOW_PREDICATE support not implemented"; break; } - case BOW_TERM: { + case BoWType::BOW_TERM: { LDEBUG << "GenericXmlDumper: output BoWTerm"; // compound informations out << "<" << m_compoundTag; @@ -775,7 +781,7 @@ xmlOutputCompound(std::ostream& out, while (! bit.isAtEnd()) { boost::shared_ptr< AbstractBoWElement > tok=bit.getElement(); LDEBUG << "next token=" << tok->getOutputUTF8String(); - xmlOutputCompound(out,tok,anagraph,posgraph,annotationData,sp,offset); + xmlOutputCompound(out,analysis,tok,anagraph,posgraph,annotationData,sp,offset); bit++; } } @@ -784,7 +790,7 @@ xmlOutputCompound(std::ostream& out, boost::shared_ptr< BoWTerm > term=boost::dynamic_pointer_cast(token); const std::deque< BoWComplexToken::Part >& parts=term->getParts(); for (auto p=parts.begin(),p_end=parts.end();p!=p_end;p++) { - xmlOutputCompound(out,(*p).getBoWToken(),anagraph,posgraph,annotationData,sp,offset); + xmlOutputCompound(out,analysis,(*p).getBoWToken(),anagraph,posgraph,annotationData,sp,offset); } } @@ -793,7 +799,7 @@ xmlOutputCompound(std::ostream& out, } break; } - case BOW_NAMEDENTITY: { + case BoWType::BOW_NAMEDENTITY: { if (m_outputCompoundParts) { LinguisticGraphVertex v=boost::dynamic_pointer_cast(token)->getVertex(); LDEBUG << "GenericXmlDumper: output BoWNamedEntity of vertex " << v; @@ -804,28 +810,29 @@ xmlOutputCompound(std::ostream& out, LERROR << "GenericXmlDumper: for vertex " << v << ": 
specific entity not found"; } else { - xmlOutputSpecificEntity(out,se.first,se.second,sp,offset); + xmlOutputSpecificEntity(out,analysis,se.first,se.second,sp,offset); } } break; } - case BOW_TOKEN: { + case BoWType::BOW_TOKEN: { if (m_outputCompoundParts) { LinguisticGraphVertex v=boost::dynamic_pointer_cast(token)->getVertex(); LDEBUG << "GenericXmlDumper: output BoWToken of vertex " << v; - xmlOutputVertexInfos(out,v,posgraph,offset); + xmlOutputVertexInfos(out,analysis,v,posgraph,offset); } break; } default: { DUMPERLOGINIT; - LERROR << "GenericXmlDumper: Error: BowToken has type BOW_NOTYPE"; + LERROR << "GenericXmlDumper: Error: BowToken has type BoWType::BOW_NOTYPE"; } } } void GenericXmlDumper::xmlOutputVertexInfos(std::ostream& out, + AnalysisContent& analysis, LinguisticGraphVertex v, LinguisticAnalysisStructure::AnalysisGraph* graph, uint64_t offset) const @@ -835,14 +842,14 @@ void GenericXmlDumper::xmlOutputVertexInfos(std::ostream& out, std::string value; // for position, correct with offset : hard coded name if (m_features[i]->getName()=="position") { - unsigned int pos=atoi(m_features[i]->getValue(graph,v).c_str()); + unsigned int pos=atoi(m_features[i]->getValue(graph,v,analysis).c_str()); pos+=offset; ostringstream oss; oss << pos; value=oss.str(); } else { - value=xmlString(m_features[i]->getValue(graph,v)); + value=xmlString(m_features[i]->getValue(graph,v,analysis)); } out << " " << m_featureTags[i] << "=\"" << value << "\""; } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/GenericXmlDumper.h b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/GenericXmlDumper.h index bb0691e26..3ea594732 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/GenericXmlDumper.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/GenericXmlDumper.h @@ -107,6 +107,7 @@ class LIMA_ANALYSISDUMPERS_EXPORT GenericXmlDumper : public AbstractTextualAnaly 
const SyntacticAnalysis::SyntacticData* syntacticData) const; void xmlOutputVertices(std::ostream& out, + AnalysisContent& analysis, LinguisticAnalysisStructure::AnalysisGraph* anagraph, LinguisticAnalysisStructure::AnalysisGraph* posgraph, const Common::AnnotationGraphs::AnnotationData* annotationData, @@ -117,6 +118,7 @@ class LIMA_ANALYSISDUMPERS_EXPORT GenericXmlDumper : public AbstractTextualAnaly const uint64_t offset) const; void xmlOutputVertex(std::ostream& out, + AnalysisContent& analysis, LinguisticGraphVertex v, LinguisticAnalysisStructure::AnalysisGraph* anagraph, LinguisticAnalysisStructure::AnalysisGraph* posgraph, @@ -127,10 +129,7 @@ class LIMA_ANALYSISDUMPERS_EXPORT GenericXmlDumper : public AbstractTextualAnaly std::set& visited, std::set& alreadyStoredVertices) const; - void xmlOutputVertexInfos(std::ostream& out, - LinguisticGraphVertex v, - LinguisticAnalysisStructure::AnalysisGraph* anagraph, - uint64_t offset) const; + void xmlOutputVertexInfos(std::ostream& out, Lima::AnalysisContent& analysis, LinguisticGraphVertex v, Lima::LinguisticProcessing::LinguisticAnalysisStructure::AnalysisGraph* graph, uint64_t offset) const; void xmlOutputBoWInfos(std::ostream& out, Common::BagOfWords::AbstractBoWElement* token, @@ -149,6 +148,7 @@ class LIMA_ANALYSISDUMPERS_EXPORT GenericXmlDumper : public AbstractTextualAnaly const Common::AnnotationGraphs::AnnotationData* annotationData) const; bool xmlOutputSpecificEntity(std::ostream& out, + AnalysisContent& analysis, const SpecificEntities::SpecificEntityAnnotation* se, LinguisticAnalysisStructure::AnalysisGraph* anagraph, const FsaStringsPool& sp, @@ -170,7 +170,9 @@ class LIMA_ANALYSISDUMPERS_EXPORT GenericXmlDumper : public AbstractTextualAnaly uint64_t offset, std::set& visited) const; - void xmlOutputCompound(std::ostream& out, boost::shared_ptr token, Lima::LinguisticProcessing::LinguisticAnalysisStructure::AnalysisGraph* anagraph, 
Lima::LinguisticProcessing::LinguisticAnalysisStructure::AnalysisGraph* posgraph, const Lima::Common::AnnotationGraphs::AnnotationData* annotationData, const Lima::FsaStringsPool& sp, uint64_t offset) const; + void xmlOutputCompound(std::ostream& out, + AnalysisContent& analysis, + boost::shared_ptr token, Lima::LinguisticProcessing::LinguisticAnalysisStructure::AnalysisGraph* anagraph, Lima::LinguisticProcessing::LinguisticAnalysisStructure::AnalysisGraph* posgraph, const Lima::Common::AnnotationGraphs::AnnotationData* annotationData, const Lima::FsaStringsPool& sp, uint64_t offset) const; /*void xmlOutputVertexInfos(std::ostream& out, const LinguisticAnalysisStructure::Token* ft, diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/LTRTextBuilder.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/LTRTextBuilder.cpp index 970c05dff..8edbb52ec 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/LTRTextBuilder.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/LTRTextBuilder.cpp @@ -78,10 +78,28 @@ LTRTextBuilder::LTRTextBuilder( void LTRTextBuilder::buildLTRTextFrom( const LinguisticGraph& graph, SegmentationData* sb, + const LinguisticGraphVertex& graphFirstVertex, const LinguisticGraphVertex& graphLastVertex, LTR_Text* textRep, uint64_t offset) { + if (sb==0) { + // no segmentation data: add tokens from all text + uint64_t tokenCounter = 0; + this->addTokensToLTRTextFrom( + graph, + graphFirstVertex, // from first vertex + graphLastVertex, // to last vertex + graphLastVertex, + textRep, + offset, + &tokenCounter); + // add a global sentence boundary (thay covers all the text) + DUMPERLOGINIT; + LDEBUG << "LTR: add sentence bound at token" << tokenCounter; + textRep->addSentenceBound(tokenCounter); + } + else { // ??OME2 SegmentationData::iterator sbIt = sb->begin(); std::vector::iterator sbIt = (sb->getSegments()).begin(); uint64_t 
tokenCounter = 0; @@ -91,8 +109,8 @@ void LTRTextBuilder::buildLTRTextFrom( LinguisticGraphVertex sentenceEnd = sbIt->getLastVertex(); this->addTokensToLTRTextFrom( graph, - sentenceBegin, - sentenceEnd, + sentenceBegin, // from sentence beginning + sentenceEnd, // to sentence end graphLastVertex, textRep, offset, @@ -100,6 +118,7 @@ void LTRTextBuilder::buildLTRTextFrom( textRep->addSentenceBound(tokenCounter); sbIt ++; } + } } void LTRTextBuilder::addTokensToLTRTextFrom( diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/LTRTextBuilder.h b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/LTRTextBuilder.h index f2ca6ed41..cdd776dd8 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/LTRTextBuilder.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/LTRTextBuilder.h @@ -87,9 +87,18 @@ class LIMA_ANALYSISDUMPERS_EXPORT LTRTextBuilder { LTRTextBuilder( const MediaId& language, StopList* stopList); + /** @brief build a LTRText representation of the analyzed text + * @param graph the linguistic graph containing the analyzed text + * @param sb a pointer on the sentence boundaries segmentation data: if zero, sentence boundaries are ignored, all text is treated as a single segment + * @param graphFirstVertex the first vertex of the text in the linguistic graph (needed when sb==0) + * @param graphLastVertex the last vertex of the text in the linguistic graph (for last segment) + * @param textRep the LTRText built + * @param offset the offset of the text in the document (to have a global correct position) + */ void buildLTRTextFrom( const LinguisticGraph& graph, Lima::LinguisticProcessing::SegmentationData* sb, + const LinguisticGraphVertex& graphFirstVertex, const LinguisticGraphVertex& graphLastVertex, Lima::Common::BagOfWords::LTR_Text* textRep, uint64_t offset); diff --git 
a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/StopList.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/StopList.cpp index 9ab26e25c..2777d2bb7 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/StopList.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/StopList.cpp @@ -24,6 +24,7 @@ #include "StopList.h" #include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" #include "common/MediaticData/mediaticData.h" +#include "common/tools/FileUtils.h" #include "common/Data/strwstrtools.h" #include "common/AbstractFactoryPattern/SimpleFactory.h" @@ -63,10 +64,10 @@ void StopList::init( LIMA_UNUSED(manager); DUMPERLOGINIT; const string& resourcesPath=Common::MediaticData::MediaticData::single().getResourcesPath(); - string stopListFileName; + QString stopListFileName; try { - stopListFileName=resourcesPath+"/"+unitConfiguration.getParamsValueAtKey("file"); + stopListFileName = Common::Misc::findFileInPaths(resourcesPath.c_str(), unitConfiguration.getParamsValueAtKey("file").c_str()); } catch (Common::XMLConfigurationFiles::NoSuchParam& ) { @@ -74,7 +75,7 @@ void StopList::init( throw InvalidConfiguration(); } - std::ifstream stopListFile(stopListFileName.c_str(), std::ifstream::binary); + std::ifstream stopListFile(stopListFileName.toUtf8().constData(), std::ifstream::binary); if (!stopListFile) { LERROR << "invalid file " << stopListFileName; throw InvalidConfiguration(); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/TextFeaturesDumper.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/TextFeaturesDumper.cpp index ae3dc05b8..fad91d30d 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/TextFeaturesDumper.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/TextFeaturesDumper.cpp @@ -109,7 +109,7 @@ void 
TextFeaturesDumper::init(Common::XMLConfigurationFiles::GroupConfigurationS } LimaStatusCode TextFeaturesDumper::process( - AnalysisContent& analysis) const + AnalysisContent& analysis) const { DUMPERLOGINIT; LinguisticMetaData* metadata=static_cast(analysis.getData("LinguisticMetaData")); @@ -166,7 +166,7 @@ LimaStatusCode TextFeaturesDumper::process( ftItr!=categoriesMapping.end(); ftItr++) { - outputVertex(dstream->out(),anagraph,ftItr->second,metadata->getStartOffset()); + outputVertex(dstream->out(),anagraph,ftItr->second,analysis,metadata->getStartOffset()); } delete dstream; @@ -175,10 +175,7 @@ LimaStatusCode TextFeaturesDumper::process( void TextFeaturesDumper:: -outputVertex(std::ostream& out, - const LinguisticAnalysisStructure::AnalysisGraph* graph, - LinguisticGraphVertex v, - uint64_t /*offset*/) const +outputVertex(ostream& out, const AnalysisGraph* graph, LinguisticGraphVertex v, AnalysisContent& analysis, uint64_t offset /*offset*/) const { //TODO : use offset bool first=true; @@ -190,7 +187,7 @@ outputVertex(std::ostream& out, out << m_sep; } // take only first morphosyntactic data - string str=(*it)->getValue(graph,v); + string str=(*it)->getValue(graph,v,analysis); boost::replace_all(str,m_sep,m_sepReplace); out << str; } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/TextFeaturesDumper.h b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/TextFeaturesDumper.h index 4d517f1e3..8d9874a95 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/TextFeaturesDumper.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/TextFeaturesDumper.h @@ -70,6 +70,7 @@ class LIMA_ANALYSISDUMPERS_EXPORT TextFeaturesDumper : public AbstractTextualAna void outputVertex(std::ostream& out, const LinguisticAnalysisStructure::AnalysisGraph* graph, LinguisticGraphVertex v, + AnalysisContent& analysis, uint64_t offset=0) const; }; diff --git 
a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/WordFeatureExtractor.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/WordFeatureExtractor.cpp index 3e4e79218..6059b010c 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/WordFeatureExtractor.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/WordFeatureExtractor.cpp @@ -63,6 +63,8 @@ FeatureExtractorFactory FeatureLemmaFactory(FeatureLemma_ID); FeatureExtractorFactory FeaturePropertyFactory(FeatureProperty_ID); FeatureExtractorFactory FeatureTstatusFactory(FeatureTstatus_ID); FeatureExtractorFactory FeatureSpecificEntityFactory(FeatureSpecificEntity_ID); +FeatureExtractorFactory FeatureLemmaSpecificEntityFactory(FeatureLemmaSpecificEntity_ID); +FeatureExtractorFactory FeatureStoredDataFactory(FeatureStoredData_ID); //*********************************************************************** // Feature list @@ -79,12 +81,12 @@ m_language(language) WordFeatures::~WordFeatures() { - for (WordFeatures::iterator it=begin(),it_end=end(); it!=it_end; it++) { - if (*it) { - delete (*it); - *it=0; - } - } +// for (WordFeatures::iterator it=begin(),it_end=end(); it!=it_end; it++) { +// if (*it) { +// delete (*it); +// *it=0; +// } +// } } void WordFeatures::initialize(const deque& featureNames) @@ -115,7 +117,8 @@ AbstractFeatureExtractor(language,complement) std::string FeaturePosition:: getValue(const LinguisticAnalysisStructure::AnalysisGraph* graph, - LinguisticGraphVertex v) const + LinguisticGraphVertex v, + AnalysisContent & /*unused*/) const { Token* token=get(vertex_token,*(graph->getGraph()),v); if (token==0) { @@ -133,7 +136,9 @@ AbstractFeatureExtractor(language,complement) std::string FeatureToken:: getValue(const LinguisticAnalysisStructure::AnalysisGraph* graph, - LinguisticGraphVertex v) const + LinguisticGraphVertex v, + AnalysisContent & /*unused*/ + ) const { Token* 
token=get(vertex_token,*(graph->getGraph()),v); if (token==0) { @@ -152,7 +157,9 @@ m_sp() std::string FeatureLemma:: getValue(const LinguisticAnalysisStructure::AnalysisGraph* graph, - LinguisticGraphVertex v) const + LinguisticGraphVertex v, + AnalysisContent & /*unused*/ + ) const { MorphoSyntacticData* data=get(vertex_data,*(graph->getGraph()),v); if (data==0) { @@ -180,7 +187,8 @@ m_propertyManager(0) std::string FeatureProperty:: getValue(const LinguisticAnalysisStructure::AnalysisGraph* graph, - LinguisticGraphVertex v) const + LinguisticGraphVertex v, + AnalysisContent & /*unused*/) const { MorphoSyntacticData* data=get(vertex_data,*(graph->getGraph()),v); if (data==0) { @@ -203,7 +211,8 @@ AbstractFeatureExtractor(language,complement) std::string FeatureTstatus:: getValue(const LinguisticAnalysisStructure::AnalysisGraph* graph, - LinguisticGraphVertex v) const + LinguisticGraphVertex v, + AnalysisContent & /*unused*/) const { Token* token=get(vertex_token,*(graph->getGraph()),v); if (token==0) { @@ -220,70 +229,96 @@ AbstractFeatureExtractor(language,complement) } std::string FeatureSpecificEntity:: -getValue(const LinguisticAnalysisStructure::AnalysisGraph* graph, - LinguisticGraphVertex v) const +getValue(const LinguisticAnalysisStructure::AnalysisGraph* graph, + LinguisticGraphVertex v, + AnalysisContent &analysis + ) const { - std::string typeName(""); - std::map::const_iterator itMSS; - int isPresent; - - std::set< AnnotationGraphVertex > anaVertices = annot->matches("PosGraph",v,"AnalysisGraph"); - if (anaVertices.size()==0) { - return "NAN" ; - } - // note: anaVertices size should be 0 or 1 - for (std::set< AnnotationGraphVertex >::const_iterator anaVerticesIt = anaVertices.begin(); - anaVerticesIt != anaVertices.end(); anaVerticesIt++) + std::string typeName("NAN"); + Common::AnnotationGraphs::AnnotationData *annot = static_cast< Common::AnnotationGraphs::AnnotationData* >(analysis.getData("AnnotationData")); + + std::set< AnnotationGraphVertex > 
matches = annot->matches(graph->getGraphId(),v,"annot"); + for (std::set< AnnotationGraphVertex >::const_iterator it = matches.begin(); it != matches.end(); it++) + { + if (annot->hasAnnotation(*it, Common::Misc::utf8stdstring2limastring("SpecificEntity"))) { - std::set< AnnotationGraphVertex > matches = annot->matches("AnalysisGraph",*anaVerticesIt,"annot"); - for (std::set< AnnotationGraphVertex >::const_iterator it = matches.begin(); - it != matches.end(); it++) - { - AnnotationGraphVertex vx=*it; - if (annot->hasAnnotation(vx, Common::Misc::utf8stdstring2limastring("SpecificEntity"))) - { - const SpecificEntityAnnotation* se = - annot->annotation(vx, Common::Misc::utf8stdstring2limastring("SpecificEntity")). - pointerValue(); - try { - LimaString str= MediaticData::single().getEntityName(se->getType()); - typeName=Common::Misc::limastring2utf8stdstring(str); - } - catch (std::exception& ) { - DUMPERLOGINIT; - LERROR << "Undefined entity type " << se->getType() << LENDL; - LERROR << "failed to output specific entity for vertex " << v << LENDL; - } - - - } else { - // we don't find any entity - return "NAN"; - } - } + AnnotationGraphVertex vx=*it; + const SpecificEntityAnnotation* se = annot->annotation(vx, Common::Misc::utf8stdstring2limastring("SpecificEntity")). 
+ pointerValue(); + + LimaString str= Common::MediaticData::MediaticData::single().getEntityName(se->getType()); + typeName=Common::Misc::limastring2utf8stdstring(str); } - - // Test if the finded type is selected - isPresent=0; // by default, an unfinded entity isn't dumped - itMSS=m_NEauthorized.find(typeName); - if (itMSS!=m_NEauthorized.end()) { - isPresent=atoi(((*itMSS).second).c_str()); - } - if (isPresent) { - return typeName; - } else { - return "NAN"; } + return typeName; } - -void FeatureSpecificEntity::setNEauthorized(std::map mp) { - - m_NEauthorized = mp; +//*********************************************************************** +FeatureLemmaSpecificEntity::FeatureLemmaSpecificEntity(MediaId language, const std::string& complement): +AbstractFeatureExtractor(language,complement) +{ } +std::string FeatureLemmaSpecificEntity:: +getValue(const LinguisticAnalysisStructure::AnalysisGraph* graph, + LinguisticGraphVertex v, + AnalysisContent &analysis +) const +{ + std::string mxvalue("NAN"); + Common::AnnotationGraphs::AnnotationData *annot = static_cast< Common::AnnotationGraphs::AnnotationData* >(analysis.getData("AnnotationData")); + + std::set< AnnotationGraphVertex > matches = annot->matches(graph->getGraphId(),v,"annot"); + for (std::set< AnnotationGraphVertex >::const_iterator it = matches.begin(); it != matches.end(); it++) + { + if (annot->hasAnnotation(*it, Common::Misc::utf8stdstring2limastring("SpecificEntity"))) + { + AnnotationGraphVertex vx=*it; + const SpecificEntityAnnotation* se = annot->annotation(vx, Common::Misc::utf8stdstring2limastring("SpecificEntity")). 
+ pointerValue(); + + LimaString str= Common::MediaticData::MediaticData::single().getEntityName(se->getType()); + mxvalue=Common::Misc::limastring2utf8stdstring(str); + } + } + // replace NAN values by lemmas + if (mxvalue == "NAN") { + MorphoSyntacticData* data=get(vertex_data,*(graph->getGraph()),v); + // take first + for (MorphoSyntacticData::const_iterator it=data->begin(),it_end=data->end();it!=it_end;it++) { + mxvalue = Common::Misc::limastring2utf8stdstring((*&(Common::MediaticData::MediaticData::single().stringsPool(m_language)))[(*it).normalizedForm]); + break; + } + } + // replace empty lemma values by tokens + if (mxvalue == "" ) { + Token* token=get(vertex_token,*(graph->getGraph()),v); + mxvalue = Common::Misc::limastring2utf8stdstring(token->stringForm()); + } + + return mxvalue; +} +//*********************************************************************** +FeatureStoredData::FeatureStoredData(MediaId language, const std::string& complement): +AbstractFeatureExtractor(language,complement) +{ +} +std::string FeatureStoredData:: +getValue(const LinguisticAnalysisStructure::AnalysisGraph* graph, + LinguisticGraphVertex v, + AnalysisContent &analysis) const +{ + Common::AnnotationGraphs::AnnotationData *annot = static_cast< Common::AnnotationGraphs::AnnotationData* >(analysis.getData("AnnotationData")); + Token* token=get(vertex_token,*(graph->getGraph()),v); + if (token==0) { + return ""; + } + ostringstream oss; + oss << token->position() ; + return oss.str(); +} } // end namespace } // end namespace diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/WordFeatureExtractor.h b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/WordFeatureExtractor.h index 433aa2b59..7bfe5a843 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/WordFeatureExtractor.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/WordFeatureExtractor.h @@ -50,7 +50,7 @@ 
class LIMA_ANALYSISDUMPERS_EXPORT AbstractFeatureExtractor virtual std::string getValue(const LinguisticAnalysisStructure::AnalysisGraph* graph, - LinguisticGraphVertex v) const=0; + LinguisticGraphVertex v, AnalysisContent &analysis) const=0; const std::string& getName() { return m_name; } void setName(const std::string& name) { m_name=name; } @@ -112,7 +112,8 @@ class LIMA_ANALYSISDUMPERS_EXPORT FeaturePosition : public AbstractFeatureExtrac std::string getValue(const LinguisticAnalysisStructure::AnalysisGraph* graph, - LinguisticGraphVertex v) const; + LinguisticGraphVertex v, + AnalysisContent & ) const; }; //---------------------------------------------------------------------- @@ -124,7 +125,9 @@ class LIMA_ANALYSISDUMPERS_EXPORT FeatureToken : public AbstractFeatureExtractor std::string getValue(const LinguisticAnalysisStructure::AnalysisGraph* graph, - LinguisticGraphVertex v) const; + LinguisticGraphVertex v, + AnalysisContent & + ) const; }; //---------------------------------------------------------------------- @@ -136,7 +139,9 @@ class LIMA_ANALYSISDUMPERS_EXPORT FeatureLemma : public AbstractFeatureExtractor std::string getValue(const LinguisticAnalysisStructure::AnalysisGraph* graph, - LinguisticGraphVertex v) const; + LinguisticGraphVertex v, + AnalysisContent & + ) const; private: const FsaStringsPool* m_sp; }; @@ -150,7 +155,9 @@ class LIMA_ANALYSISDUMPERS_EXPORT FeatureProperty : public AbstractFeatureExtrac std::string getValue(const LinguisticAnalysisStructure::AnalysisGraph* graph, - LinguisticGraphVertex v) const; + LinguisticGraphVertex v, + AnalysisContent & + ) const; private: std::string m_propertyName; const Common::PropertyCode::PropertyAccessor* m_propertyAccessor; @@ -166,7 +173,9 @@ class LIMA_ANALYSISDUMPERS_EXPORT FeatureTstatus : public AbstractFeatureExtract std::string getValue(const LinguisticAnalysisStructure::AnalysisGraph* graph, - LinguisticGraphVertex v) const; + LinguisticGraphVertex v, + AnalysisContent & + ) const; }; 
//-------------------------------------------------------- @@ -178,15 +187,35 @@ class LIMA_ANALYSISDUMPERS_EXPORT FeatureSpecificEntity : public AbstractFeature ~FeatureSpecificEntity() {} std::string getValue(const LinguisticAnalysisStructure::AnalysisGraph* graph, - LinguisticGraphVertex v) const; - - void setNEauthorized(std::map mp); - - Common::AnnotationGraphs::AnnotationData* annot; - std::map m_NEauthorized; + LinguisticGraphVertex v, + AnalysisContent &) const; +}; +//-------------------------------------------------------- +#define FeatureLemmaSpecificEntity_ID "lemmaSpecificEntity" +class LIMA_ANALYSISDUMPERS_EXPORT FeatureLemmaSpecificEntity : public AbstractFeatureExtractor +{ +public: + FeatureLemmaSpecificEntity(MediaId language, const std::string& complement=""); + ~FeatureLemmaSpecificEntity() {} + + std::string getValue(const LinguisticAnalysisStructure::AnalysisGraph* graph, + LinguisticGraphVertex v, + AnalysisContent &) const; }; +//-------------------------------------------------------- +#define FeatureStoredData_ID "storedData" +class LIMA_ANALYSISDUMPERS_EXPORT FeatureStoredData : public AbstractFeatureExtractor +{ +public: + FeatureStoredData(MediaId language, const std::string& complement=""); + ~FeatureStoredData() {} + + std::string getValue(const LinguisticAnalysisStructure::AnalysisGraph* graph, + LinguisticGraphVertex v, + AnalysisContent &) const; +}; } // end namespace } // end namespace diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/linearTextRepresentationDumper.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/linearTextRepresentationDumper.cpp index 7234afa11..19b882a98 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/linearTextRepresentationDumper.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/linearTextRepresentationDumper.cpp @@ -130,8 +130,8 @@ LimaStatusCode 
LinearTextRepresentationDumper::process( // get sentence boundaries SegmentationData* sb = dynamic_cast(analysis.getData("SentenceBoundaries")); if (sb == 0) { - LERROR << "LinearTextRepresentationDumper::process: no SentenceBounds ! abort"; - return MISSING_DATA; + LDEBUG << "LinearTextRepresentationDumper::process: no SentenceBounds available: ignored"; + // sentence bounds ignored: null pointer passed to LTRTextBuilder will be handled there } // build LTRText LTR_Text textRep; @@ -139,6 +139,7 @@ LimaStatusCode LinearTextRepresentationDumper::process( builder.buildLTRTextFrom( *(anaGraph->getGraph()), sb, + anaGraph->firstVertex(), anaGraph->lastVertex(), &textRep, metadata->getStartOffset()); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/linearTextRepresentationLogger.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/linearTextRepresentationLogger.cpp index b2a68bcce..1f692e1e8 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/linearTextRepresentationLogger.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/linearTextRepresentationLogger.cpp @@ -129,8 +129,8 @@ LimaStatusCode LinearTextRepresentationLogger::process( // get sentence boundaries SegmentationData* sb = dynamic_cast(analysis.getData("SentenceBoundaries")); if (sb == 0) { - LERROR << "no SentenceBounds ! 
abort"; - return MISSING_DATA; + LDEBUG << "LinearTextRepresentationDumper::process: no SentenceBounds available: ignored"; + // sentence bounds ignored: null pointer passed to LTRTextBuilder will be handled there } // build LTRText LTR_Text textRep; @@ -138,6 +138,7 @@ LimaStatusCode LinearTextRepresentationLogger::process( builder.buildLTRTextFrom( *(anaGraph->getGraph()), sb, + anaGraph->firstVertex(), anaGraph->lastVertex(), &textRep, metadata->getStartOffset()); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/ApplyRecognizer/applyRecognizerActions.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/ApplyRecognizer/applyRecognizerActions.cpp index 76a93e57b..2d47416b1 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/ApplyRecognizer/applyRecognizerActions.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/ApplyRecognizer/applyRecognizerActions.cpp @@ -315,6 +315,13 @@ operator()(RecognizerMatch& result, // create the new token pair newToken= createAlternativeToken(result); + if (newToken.second->empty()) { + APPRLOGINIT; + LERROR << "CreateAlternative::operator(): Got empty morphosyntactic data. Abort."; + delete newToken.first; + delete newToken.second; + return false; + } // LDEBUG << "create alternative token " << newToken.first->stringForm(); // add the vertex @@ -351,6 +358,13 @@ operator()(RecognizerMatch& result, // create the new token pair newToken= createAlternativeToken(result); + if (newToken.second->empty()) { + APPRLOGINIT; + LERROR << "CreateAlternative::operator(): Got empty morphosyntactic data. 
Abort."; + delete newToken.first; + delete newToken.second; + return false; + } // add the vertex LinguisticGraphVertex altVertex = @@ -374,6 +388,14 @@ operator()(RecognizerMatch& result, // LDEBUG << "duplication vertex " << matchItr->getVertex();; Token* token=get(vertex_token,*graph,matchItr->getVertex()); MorphoSyntacticData* data=new MorphoSyntacticData(*get(vertex_data,*graph,matchItr->getVertex())); + if (data->empty()) + { + // ignore current idiomatic expression, continue + APPRLOGINIT; + LERROR << "CreateAlternative::operator() Got empty morphosyntactic data. Abort"; + delete data; + return false; + } LinguisticGraphVertex dupVx=add_vertex(*graph); put(vertex_token,*graph,dupVx,token); put(vertex_data,*graph,dupVx,data); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/EntityFeatures.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/EntityFeatures.cpp index cf3ecd874..035f134e7 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/EntityFeatures.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/EntityFeatures.cpp @@ -75,6 +75,13 @@ bool EntityFeature::operator==(const EntityFeature& f) const return (boost::any_cast(m_value)==boost::any_cast(f.m_value)); } if (type==typeid(LimaString)) { +#ifdef ANTINNO_SPECIFIC +#ifdef DEBUG_LP + SELOGINIT + LDEBUG << "EntityFeature::operator==(EntityFeature& f): f.value: " << boost::any_cast(f.m_value); + LDEBUG << "EntityFeature::operator==(EntityFeature& f): this.value: " << boost::any_cast(m_value); +#endif +#endif return (boost::any_cast(m_value)==boost::any_cast(f.m_value)); } if (type==typeid(double)) { @@ -263,6 +270,18 @@ std::ostream& operator<<(std::ostream& os, const EntityFeatures& f) { return os; } +QDebug& operator<<(QDebug& os, const EntityFeatures& f) { + if (f.empty()) { + return os; + } + EntityFeatures::const_iterator it=f.begin(),it_end=f.end(); + os << (*it).getName() << "=" << 
(*it).getValueString(); + for (it++; it!=it_end; it++) { + os << "/" << (*it).getName() << "=" << (*it).getValueString(); + } + return os; +} + } // end namespace } // end namespace diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/EntityFeatures.h b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/EntityFeatures.h index e9dc3873c..0ab303cf6 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/EntityFeatures.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/EntityFeatures.h @@ -124,19 +124,32 @@ class LIMA_AUTOMATON_EXPORT EntityFeatures: public std::vector { EntityFeatures::iterator findLast(const std::string& featureName); friend LIMA_AUTOMATON_EXPORT std::ostream& operator<<(std::ostream& os, const EntityFeatures& f); + friend LIMA_AUTOMATON_EXPORT QDebug& operator<<(QDebug& os, const EntityFeatures& f); }; +#if defined(WIN32) + extern template LIMA_AUTOMATON_EXPORT void EntityFeatures::appendFeature(const std::string&, const int& ); + extern template LIMA_AUTOMATON_EXPORT void EntityFeatures::appendFeature(const std::string&, const double& ); + extern template LIMA_AUTOMATON_EXPORT void EntityFeatures::appendFeature(const std::string&, const QString& ); +#endif + template void EntityFeatures::setFeature(const std::string& name, const ValueType& value) { - SELOGINIT; - LDEBUG << "EntityFeatures::setFeature(" << name << "," << value << ")"; +// SELOGINIT; +// LDEBUG << "EntityFeatures::setFeature(" << name << "," << value << ")"; // if feature with same name already exists, overwrite it EntityFeatures::iterator it=find(name); if (it!=end()) { // if( (it!=end()) && (name==DEFAULT_ATTRIBUTE) ){ (*it).setValue(boost::any(value)); +#ifdef ANTINNO_SPECIFIC +#ifdef DEBUG_LP + SELOGINIT; + LDEBUG << "EntityFeatures::setFeature(" << name << "," << (*it).getValueString() << ")"; +#endif +#endif } else { //push empy feature and set values to avoid two copies @@ -144,18 +157,30 
@@ template push_back(EntityFeature()); back().setName(name); back().setValue(boost::any(value)); +#ifdef ANTINNO_SPECIFIC +#ifdef DEBUG_LP + SELOGINIT; + LDEBUG << "EntityFeatures::setFeature(" << name << "," << back().getValueString() << ")"; +#endif +#endif } } template void EntityFeatures::addFeature(const std::string& name, const ValueType& value) { - SELOGINIT; - LDEBUG << "EntityFeatures::addFeature(" << name << "," << value << ")"; +// SELOGINIT; +// LDEBUG << "EntityFeatures::addFeature(" << name << "," << value << ")"; push_back(EntityFeature()); back().setName(name); back().setValue(boost::any(value)); - } +#ifdef ANTINNO_SPECIFIC +#ifdef DEBUG_LP + SELOGINIT; + LDEBUG << "EntityFeatures::addFeature(" << name << "," << back().getValueString() << ")"; +#endif +#endif + } /* template void EntityFeatures::appendFeature(const std::string& name, diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/SpecificEntityAnnotation.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/SpecificEntityAnnotation.cpp index 96d7b6da5..73724e735 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/SpecificEntityAnnotation.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/SpecificEntityAnnotation.cpp @@ -47,9 +47,17 @@ m_normalizedForm(0), m_position(entity.positionBegin()), m_length(entity.length()) { +#ifdef ANTINNO_SPECIFIC + LOGINIT("LP::Automaton"); + LDEBUG << "entity.features(): " << entity.features(); +#endif Automaton::EntityFeatures::const_iterator f=entity.features().find(DEFAULT_ATTRIBUTE); if (f!=entity.features().end()) { + #ifdef ANTINNO_SPECIFIC + LOGINIT("LP::Automaton"); + LDEBUG << "entity.features()[\"value\"]: " << boost::any_cast((*f).getValue()); + #endif m_normalizedForm=sp[boost::any_cast((*f).getValue())]; } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/automaton.cpp 
b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/automaton.cpp index cef11d732..bdebc2581 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/automaton.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/automaton.cpp @@ -33,6 +33,9 @@ #include #include #include +#include +#include +#include using namespace std; using namespace Lima::LinguisticProcessing::LinguisticAnalysisStructure; @@ -49,6 +52,9 @@ namespace Automaton { #define DEFAULT_MAXNBRESULTS 50 #define DEFAULT_MAXRESULTSIZE 200 +// a structure to store the position of the search in the automaton +typedef std::pair,const Transition*> DFFSPos; + AutomatonControlParams::AutomatonControlParams(): m_maxDepthStack(DEFAULT_MAXDEPTHSTACK), m_maxTransitionsExplored(DEFAULT_MAXTRANSITIONSEXPLORED), @@ -233,18 +239,26 @@ void Automaton::initializeSearchStructures(MediaId language) { bool Automaton:: getMatchingTransitions(const LinguisticAnalysisStructure::AnalysisGraph& graph, - const LinguisticGraphVertex& vertex, + const LinguisticGraphVertex& vertex, AnalysisContent& analysis, + SearchGraph* searchGraph, const Tstate& state, - std::vector& - matchingTransitions) const { + std::vector& matchingTransitions, + const LinguisticGraphVertex& limit + ) const { Token* token = get(vertex_token, *(graph.getGraph()), vertex); MorphoSyntacticData* data = get(vertex_data, *(graph.getGraph()), vertex); - if (m_searchStructures[state]==0) { -// AULOGINIT; +#ifdef DEBUG_LP + AULOGINIT; + LDEBUG << "Automaton::getMatchingTransitions(vertex: " << vertex << ")"; // LDEBUG << "search structure not initialized: linear search"; +#endif + if (m_searchStructures[state]==0) { //linear search on the transitions +#ifdef DEBUG_LP + LDEBUG << "Automaton::getMatchingTransitions: search structure not initialized: linear search"; +#endif matchingTransitions.clear(); vector::const_iterator trans=m_transitions[state].begin(), @@ -252,19 +266,42 @@ getMatchingTransitions(const 
LinguisticAnalysisStructure::AnalysisGraph& graph, for (; trans!=trans_end; trans++) { // LDEBUG << "Automaton::getMatchingTransitions vertex: " << vertex; + deque noVertices; +#ifdef ANTINNO_SPECIFIC + DFFSPos newPair(noVertices,nullptr); +#else + DFFSPos newPair(noVertices,0); +#endif + bool match=(*trans).transitionUnit()->compare(graph,vertex,analysis,token,data); + const GazeteerTransition* gtrans = dynamic_cast((*trans).transitionUnit()); + // TODO: generalize buildNextTermsList and checkMultiTerms to be able to manage backtrack and backward + if( gtrans != 0 ) { + deque vertices; + match = gtrans->matchPath(graph, vertex, limit, searchGraph, analysis, token, vertices, data); + if( match ) { + newPair = DFFSPos(vertices,&(*trans)); + } + } + else { + deque singleton(1,vertex); + newPair = DFFSPos(singleton,&(*trans)); + } if ((*trans).transitionUnit()->negative()) { match = (!match); } if (match) { - matchingTransitions.push_back(&(*trans)); + matchingTransitions.push_back(newPair); } } return (!matchingTransitions.empty()); } else { +#ifdef DEBUG_LP + LDEBUG << "Automaton::getMatchingTransitions: search structure initialized find"; +#endif return m_searchStructures[state]-> - findMatchingTransitions(graph,vertex,analysis,token,data,matchingTransitions); + findMatchingTransitions2(graph,vertex,limit,searchGraph,analysis,token,data,matchingTransitions); } } @@ -311,6 +348,7 @@ operator()(const AutomatonMatch& r1, // internal definition of a utility class: // stack for DFS test function + class Automaton::DFSStack { public: DFSStack(const Automaton& a, @@ -333,24 +371,25 @@ class Automaton::DFSStack { bool isEndVertex(const LinguisticGraphVertex& v) const { return (v==m_searchGraph->endOfGraph(m_graph)); } - std::pair top(); - void popVertex(); + // std::pair top(); + DFFSPos top(); + /* TODO: usefull? 
+ * void popVertex(); + */ bool pop(); bool push(const LinguisticGraphVertex& vertex, const Tstate& state, - AnalysisContent& analysis); + AnalysisContent& analysis, + const LinguisticGraphVertex& limit); private: struct DFSStackElement { - DFSStackElement(LinguisticGraphVertex v, - const std::vector& t): - m_vertex(v), - m_transitions(t), - m_transition(t.begin()) + DFSStackElement( std::vector& matchingTransitions): + m_transitions(matchingTransitions), + m_transition(matchingTransitions.begin()) { } DFSStackElement(const DFSStackElement& elt): - m_vertex(elt.m_vertex), m_transitions(elt.m_transitions), m_transition(m_transitions.begin()) { @@ -358,9 +397,10 @@ class Automaton::DFSStack { ~DFSStackElement() {} - LinguisticGraphVertex m_vertex; - std::vector m_transitions; - std::vector::const_iterator m_transition; + std::vector m_transitions; + //std::vector > m_transitions; + std::vector::const_iterator m_transition; + //std::vector >::const_iterator m_transition; }; std::vector m_stack; const Automaton& m_automaton; @@ -369,16 +409,15 @@ class Automaton::DFSStack { LinguisticGraphVertex m_limit; }; -std::pair -Automaton::DFSStack::top() { +//std::pair +DFFSPos Automaton::DFSStack::top() { // AULOGINIT; // LDEBUG << "Automaton:DFSSTack: top " // << "transition=" << *(m_stack.back().m_transition) // << ";transitionUnit=" // << (*(m_stack.back().m_transition))->transitionUnit() // ; - return make_pair(m_stack.back().m_vertex, - *(m_stack.back().m_transition)); + return *(m_stack.back().m_transition); } bool Automaton::DFSStack::pop() { @@ -395,14 +434,68 @@ bool Automaton::DFSStack::pop() { return false; } -void Automaton::DFSStack::popVertex() { +/* TODO usefull? 
+ * void Automaton::DFSStack::popVertex() { m_stack.pop_back(); } - +*/ +/* + * fill the stack with pairs (nextV,matchingTransition) + * nextV is one of the successor nodes in the graph + * The function look for possible transition from state + * and select matchingTransition = set of transition which succeed with nextV + */ +/* + * Pour remplir la pile, on itére sur les outVertex, + * puis pour chaque vertex, on regarde quelles transitions obtiennent un succès + * Cela ressemble à l'initialisation d'un mode largeur d'abord... + * En fait, c'est simplement pour limiter la taille de la structure de données qui gère le contexte de parcours. + * Le parcours se fait en profondeur d'abord (DFS Deep First Search) + * conforme au nom de la pile DFSStack. + * + * Le parcours se fait en profondeur d'abord sur le graphe d'analyse, limité sur plusieurs aspects: + * - les limites du graphe (begin, end), c'est à dire les noeuds 0 et 1 qui terminent le treillis. + * (si le parcours se fait en avant, limit = end, si le parcours se fait en arière, limit = begin) + * - la profondeur de la pile (pour éviter des traitements trop longs et des dépassements de pile sur + * des textes 'pathologiques', ex: des texts issus de tableaux) + * - le nombre de backtrack??? + * L'unité d'avancement dans ce parcours est le passage d'un noeud à l'un des noeuds successeurs + * dans le graphe d'analyse. De même dans les opérations de backtrack, on revient sur une étape de + * ce parcours. + * Si on souhaite intégrer les transitions de type GazetteerTransition, il faut pouvoir + * gérer une unité d'avancement différente: il faut envisager l'avancement sur plusieurs noeuds + * successifs du graphe lorsqu'il y a un match d'un élément multi-terme du gazetteer. De même le + * backtrack doit se faire jusqu'au point d'avancement précédent donc revenir en arrière sur + * plusieurs noeuds. + * Une pile sert à gérer le point d'avancement dans le parcours. 
+ * Actuellement, pour remplir la pile, on itére sur les 'out vertex' puis pour chaque vertex, on regarde + * quelles transitions obtiennent un succès. Cela ne convient plus car on ne couvre pas le cas des noeuds + * atteints par les éléments multi-termes des gazeteer. + * En effet, pour une paire (out vertex, transition) qui décrit une possibilité d'avancement, l'exécution de + * la transition va nous faire avancer au delà du noeud 'out vertex' dans le cas des multi-terme. + * Toutes les transitions ne font pas atteindre le même noeud. + * On est donc obligé de modifier la structure de données de la pile qui gére le contexte de parcours et le + * backtrack. + * Changement: + + * On modifie seulement Automaton::getMatchingTransitions et la structure Automaton::DFSStack. + * On considère que nextVertex est la direction dans laquelle on va, mais la transition peut mener plus loin. + * On modifie DFSStackElement de la façon suivante: + * DFSStackElement contenait un noeud (out vertex) et une collection (vector) de transitions possibles + * DFSStackElement contient maintenant une collection (vector) de paires (séquence de noeud parcourus pendant la transition, transition possible) + * (stack, transition), ainsi qu'un itérateur sur cette liste. + * stack est le chemin dans le graphe (commençant par nextVertex) correspondant à l'exécution de la transition. + * + * Attention aux paramètres begin,end de la fonction checkMultiTerms + * La fonction checkMultiTerms a été écrite pour avec les limitations suivantes: sens forward seulement, pas de + * prise en compte de multiples arêtes à partir d'un noeud. 
+ * + */ bool Automaton::DFSStack:: push(const LinguisticGraphVertex& vertex, const Tstate& state, - AnalysisContent& analysis) { + AnalysisContent& analysis, + const LinguisticGraphVertex& limit) { /* AULOGINIT; LDEBUG << "Automaton:DFSSTack: pushing " << vertex @@ -425,12 +518,13 @@ push(const LinguisticGraphVertex& vertex, LinguisticGraphVertex nextVertex; while (m_searchGraph->getNextVertex(m_graph.getGraph(),nextVertex)) { if (! isEndVertex(nextVertex)) { - std::vector matchingTransitions(0); + std::vector matchingTransitions(0); // LDEBUG << "Automaton:get matching transitions from state " // << state << " for vertex " << nextVertex; if (m_automaton. getMatchingTransitions(m_graph,nextVertex,analysis, - state,matchingTransitions)) { + m_searchGraph,state,matchingTransitions,limit)) { + /* if (logger.isDebugEnabled()) { ostringstream oss; std::vector::const_iterator @@ -442,7 +536,7 @@ push(const LinguisticGraphVertex& vertex, } LDEBUG << oss.str(); }*/ - tmpStack.push_back(DFSStackElement(nextVertex,matchingTransitions)); + tmpStack.push_back(DFSStackElement(matchingTransitions)); } /* else { LDEBUG << "Automaton:DFSSTack: => no matching transitions" @@ -526,7 +620,7 @@ getAllMatches(const LinguisticAnalysisStructure::AnalysisGraph& graph, &forward, limit); success = testFromState(initialState, graph, - begin, analysis, + begin, limit, analysis, results, checkList, forwardSearchStack, @@ -541,7 +635,7 @@ getAllMatches(const LinguisticAnalysisStructure::AnalysisGraph& graph, &backward, limit); success = testFromState(initialState, graph, - begin, analysis, + begin, limit, analysis, results, checkList, backwardSearchStack, @@ -557,6 +651,7 @@ getAllMatches(const LinguisticAnalysisStructure::AnalysisGraph& graph, bool Automaton::testFromState(const Tstate firstState, const LinguisticAnalysisStructure::AnalysisGraph& graph, const LinguisticGraphVertex& beginVertex, + const LinguisticGraphVertex& limitVertex, AnalysisContent& analysis, AutomatonMatchSet& results, 
ConstraintCheckList& checkList, @@ -569,7 +664,7 @@ bool Automaton::testFromState(const Tstate firstState, // store in stack pairs of (automaton transition/graph vertex) // (store combinatory of all possible pairs, but if store only // matching pairs, problems with ConstraintCheckList - + RecognizerMatch currentMatch(&graph); // check initial state @@ -582,16 +677,18 @@ bool Automaton::testFromState(const Tstate firstState, return (!results.empty()); } - // begin is the vertex that matched the trigger: - // push following vertices + // beginVertex is the vertex that matched the trigger + // initialize the stack with pairs (stack of vertex with nextV as first element,matchingTransition) + // nextV is one of the successor nodes in the graph and matchingTransition(nextV) succeeds // LDEBUG << "pushing"; - S.push(beginVertex,firstState,analysis); + S.push(beginVertex,firstState,analysis,limitVertex); LinguisticGraphVertex vertex; const Transition* transition(0); uint64_t nbIter(0); bool backtrack(false); + // contexte de backtrack vector backtrackDepth; backtrackDepth.push_back(0); @@ -602,18 +699,19 @@ bool Automaton::testFromState(const Tstate firstState, // LDEBUG << "in iteration " << nbIter; if (S.size() > controlParams.getMaxDepthStack()) { AULOGINIT; - LWARN << "MaxDepthStack exceeded in automaton search: ignore rest of search" - ; + LWARN << "MaxDepthStack exceeded in automaton search: ignore rest of search"; return (!results.empty()); } if (nbIter > controlParams.getMaxTransitionsExplored()) { AULOGINIT; - LWARN << "MaxTransitionsExplored exceeded in automaton search: ignore rest of search" - ; + LWARN << "MaxTransitionsExplored exceeded in automaton search: ignore rest of search"; return (!results.empty()); } - boost::tie(vertex,transition)=S.top(); + // boost::tie(vertex,transition)=S.top(); + DFFSPos const & dffsPos = S.top(); + vertex = dffsPos.first.front(); + transition = dffsPos.second; if (backtrack) { // in backtrack : pop_back current match until the 
vertex // for which we are testing a new matching transition @@ -658,12 +756,17 @@ bool Automaton::testFromState(const Tstate firstState, // } //if (trans->match(graph,vertex,analysis,checkList)) { + // TODO: call checkConstraints for every vertex in the deque? if (trans->checkConstraints(graph,vertex,analysis,checkList)) { // LDEBUG << "Automaton: -> match found"; // update current match LimaString transId = LimaString::fromUtf8( trans->getId().c_str() ); - currentMatch.addBackVertex(vertex,trans->keep(), transId); + // OME: call for the complete stack currentMatch.addBackVertex(vertex,trans->keep(), transId); + std::deque::const_iterator vIt = dffsPos.first.begin(); + for( ; vIt != dffsPos.first.end() ; vIt++ ) { + currentMatch.addBackVertex(*vIt,trans->keep(), transId); + } /* LDEBUG << "Automaton: -> vertex (" << vertex << ",keep=" << trans->keep() << ") added in result, currentMatch=" @@ -717,7 +820,7 @@ bool Automaton::testFromState(const Tstate firstState, } // push next vertices - if (!S.push(vertex,nextState,analysis)) { + if (!S.push(vertex,nextState,analysis,limitVertex)) { backtrack=true; } } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/automaton.h b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/automaton.h index fc24f82ff..a6e12e728 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/automaton.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/automaton.h @@ -35,6 +35,9 @@ #include "AutomatonExport.h" #include "transitionUnit.h" #include "searchGraph.h" +#ifdef ANTINNO_SPECIFIC +#include "gazeteerTransition.h" +#endif #include "transition.h" #include "transitionSearchStructure.h" #include "recognizerMatch.h" @@ -341,9 +344,10 @@ friend class AutomatonWriter; bool getMatchingTransitions(const LinguisticAnalysisStructure::AnalysisGraph& graph, const LinguisticGraphVertex& vertex, AnalysisContent& analysis, + SearchGraph* searchGraph, const Tstate& 
state, - std::vector& - matchingTransitions) const; + std::vector,const Transition*> >& matchingTransitions, + const LinguisticGraphVertex& limit) const; protected: Tstate m_numberStates; /**< number of states in the automaton */ @@ -366,6 +370,7 @@ friend class AutomatonWriter; bool testFromState(const Tstate firstState, const LinguisticAnalysisStructure::AnalysisGraph& graph, const LinguisticGraphVertex& begin, + const LinguisticGraphVertex& limit, AnalysisContent& analysis, AutomatonMatchSet& results, ConstraintCheckList& checkList, diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/automatonCommon.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/automatonCommon.cpp index 9d1fb9a95..7e70908c5 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/automatonCommon.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/automatonCommon.cpp @@ -59,6 +59,37 @@ void writeTword(std::ofstream& file,const Tword& s,const FsaStringsPool& sp) Misc::writeUTF8StringField(file,sp[s]); } +// LimaString type +void readLimaString(std::ifstream& file, LimaString& s) +{ + Misc::readUTF8StringField(file,s); +} +void writeLimaString(std::ofstream& file,const LimaString& s) +{ + Misc::writeUTF8StringField(file,s); +} + +// wordSet = set of multi-term +void readWordVector(std::ifstream& file, std::vector& wordVector) +{ + int i = Misc::readCodedInt(file); + for( ; i > 0 ; i-- ) { + LimaString s; + Misc::readUTF8StringField(file,s); + wordVector.push_back(s); + } +} + +void writeWordSet(std::ofstream& file,const std::set& wordSet) +{ + int i = wordSet.size(); + Misc::writeCodedInt(file,i); + std::set::const_iterator wordIt = wordSet.begin(); + for( ; wordIt != wordSet.end() ; wordIt++ ) { + Misc::writeUTF8StringField(file,*wordIt); + } +} + //---------------------------------------------------------------------- // Part-of-speech type void readTpos(std::ifstream& file, Tpos& p) diff --git 
a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/automatonCommon.h b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/automatonCommon.h index d1d20f438..7abf6f758 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/automatonCommon.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/automatonCommon.h @@ -42,6 +42,8 @@ #include #include #include +#include +#include namespace Lima { namespace LinguisticProcessing { @@ -70,6 +72,10 @@ void writeTword(std::ofstream& file,const Tword& s,const FsaStringsPool& sp); void readTpos(std::ifstream&, Tpos&); void writeTpos(std::ofstream&,const Tpos&); +// reading and writing set of words (for gazeteer) +void readWordVector(std::ifstream& file, std::vector& wordVector); +void writeWordSet(std::ofstream& file,const std::set& wordSet); + //comparing the part-of-speech type with a LingPropertyEntry // should take a const LingPropertyEntry& argument // check with JYS diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/automatonReaderWriter.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/automatonReaderWriter.cpp index 3ffc837c9..7c0fb4e48 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/automatonReaderWriter.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/automatonReaderWriter.cpp @@ -46,6 +46,7 @@ #include "setTransition.h" #include "deaccentuatedTransition.h" #include "entityTransition.h" +#include "entityGroupTransition.h" #include "common/Data/readwritetools.h" #include "linguisticProcessing/core/LinguisticAnalysisStructure/TStatus.h" @@ -71,7 +72,12 @@ void writeTypeTransition(std::ofstream& file, const TypeTransition t) { } +#ifdef ANTINNO_SPECIFIC #define RECOGNIZER_VERSION "1.20" +#else +#define RECOGNIZER_VERSION "1.30" +#endif + #define RECOGNIZER_DEBUG_VERSION ".debug" 
//---------------------------------------------------------------------- @@ -396,6 +402,18 @@ readTransitionUnit(std::ifstream& file,MediaId language) t=new TStatusTransition(status); break; } + case T_GAZETEER: { + // read alias + LimaString alias; + Misc::readUTF8StringField(file,alias); + // read set of words + std::vector wordVector; + readWordVector(file,wordVector); + // read keep + int keepVal = Misc::readCodedInt(file); + // create transition + t=new GazeteerTransition(wordVector, alias, keepVal == 1); + break; } case T_AND: { uint64_t size=Misc::readCodedInt(file); vector tmp(size); @@ -441,6 +459,12 @@ readTransitionUnit(std::ifstream& file,MediaId language) t=new EntityTransition(m_entityTypeMapping[EntityType(typeId,groupId)]); break; } + case T_ENTITY_GROUP: { + EntityGroupId groupId=static_cast(Misc::readCodedInt(file)); + // use entityGroup mapping + t=new EntityGroupTransition(m_entityGroupMapping[groupId]); + break; + } default: { AULOGINIT; LERROR << "Undefined type of transition: " << codeTrans; @@ -460,6 +484,7 @@ readTransitionUnit(std::ifstream& file,MediaId language) char *buf = new char [len]; file.read(buf, len); t->setId(std::string(buf,len)); + delete[] buf; uint64_t n=Misc::readCodedInt(file); Constraint c; for (uint64_t i(0); ipartOfSpeech()); break; } + case T_GAZETEER: { + GazeteerTransition* t=static_cast(transition); + Misc::writeUTF8StringField(file,t->alias()); + writeWordSet(file,t->wordSet()); + if( t->keep() ) + Misc::writeCodedInt(file,1); + else + Misc::writeCodedInt(file,0); + break; + } case T_NUM: { NumericTransition* t=static_cast(transition); Misc::writeCodedInt(file,t->value()); @@ -725,6 +760,12 @@ writeTransitionUnit(std::ofstream& file, file.write((char*) &lang,sizeof(unsigned char)); break; } + case T_ENTITY_GROUP: { + EntityGroupTransition* t=static_cast(transition); + EntityGroupId entityGroupId=t->entityGroupId(); + Misc::writeCodedInt(file,entityGroupId); + break; + } case T_ENTITY: { EntityTransition* 
t=static_cast(transition); EntityType entityType=t->entityType(); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/entityGroupTransition.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/entityGroupTransition.cpp new file mode 100644 index 000000000..46a36faa3 --- /dev/null +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/entityGroupTransition.cpp @@ -0,0 +1,130 @@ +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. 
If not, see +*/ +/************************************************************************* + * + * @file entityGroupTransition.cpp + * @author Olivier Mesnard (olivier.mesnard@cea.fr) + * @date Mon oct 5 2015 + * copyright (c) 2006-2015 by CEA + * + *************************************************************************/ + + +#include "entityGroupTransition.h" +#include "linguisticProcessing/LinguisticProcessingCommon.h" +#include "linguisticProcessing/common/annotationGraph/AnnotationData.h" +#include "linguisticProcessing/core/Automaton/SpecificEntityAnnotation.h" + +using namespace std; +using namespace Lima::LinguisticProcessing::LinguisticAnalysisStructure; +using namespace Lima::Common::MediaticData; +using namespace Lima::Common::AnnotationGraphs; +using namespace Lima::LinguisticProcessing::SpecificEntities; + +namespace Lima { +namespace LinguisticProcessing { +namespace Automaton { + +/***********************************************************************/ +// initialization of static members +LimaString EntityGroupTransition::m_entityAnnotation=Common::Misc::utf8stdstring2limastring("SpecificEntity"); + +/***********************************************************************/ +// constructors +/***********************************************************************/ +EntityGroupTransition::EntityGroupTransition(): +TransitionUnit(), +m_entityGroupId() +{ +} + +EntityGroupTransition::EntityGroupTransition(Common::MediaticData::EntityGroupId groupId, bool keep): +TransitionUnit(keep), +m_entityGroupId(groupId) +{ +} + +EntityGroupTransition::~EntityGroupTransition() {} + +std::string EntityGroupTransition::printValue() const { + ostringstream oss; + oss << "ENTITY_GROUP_" << m_entityGroupId; + return oss.str(); +} + +/***********************************************************************/ +// operators == +/***********************************************************************/ +bool EntityGroupTransition::operator== (const TransitionUnit& tright) 
const { + if ( (type() == tright.type()) + && (m_entityGroupId == static_cast(tright).entityGroupId()) + ) { + return true; + } + else { + return false; + } +} + +bool EntityGroupTransition:: +compare(const LinguisticAnalysisStructure::AnalysisGraph& graph, + const LinguisticGraphVertex& v, + AnalysisContent& analysis, + const LinguisticAnalysisStructure::Token* /*token*/, + const LinguisticAnalysisStructure::MorphoSyntacticData* /*data*/) const +{ + // should compare to vertex ? + AnnotationData* annotationData = static_cast< AnnotationData* >(analysis.getData("AnnotationData")); + if (annotationData==0) { + AULOGINIT; + LDEBUG << "EntityGroupTransition::compare: no annotation graph available !"; + return false; + } + + // find annotationGraphVertex matching the vertex of the current graph + std::set matches = annotationData->matches(graph.getGraphId(), v, "annot"); + if (matches.empty()) + { + AULOGINIT; + LDEBUG << "annotation ("<hasAnnotation(annotVertex, m_entityAnnotation)) + { + AULOGINIT; + LDEBUG << "EntityGroupTransition::compare: No " << m_entityAnnotation << " annotation available on " << v; + return false; + } + + const SpecificEntityAnnotation* se = + annotationData->annotation(annotVertex, m_entityAnnotation). 
+ pointerValue(); + Common::MediaticData::EntityType type = se->getType(); + AULOGINIT; + LDEBUG << "EntityGroupTransition::compare: type = " << type << ", groupId = " << type.getGroupId(); + LDEBUG << "EntityGroupTransition::compare: m_entityGroupId = " << m_entityGroupId; + LDEBUG << "EntityGroupTransition::compare: tests m_entityGroupId == type.getGroupId() = " << (m_entityGroupId == type.getGroupId()); + return( m_entityGroupId == type.getGroupId() ); +} + +} // namespace end +} // namespace end +} // namespace end diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/entityGroupTransition.h b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/entityGroupTransition.h new file mode 100644 index 000000000..170105cc4 --- /dev/null +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/entityGroupTransition.h @@ -0,0 +1,86 @@ +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. 
If not, see +*/ +/************************************************************************ + * + * @file EntityGroupTransition.h + * @author Olivier Mesnard (olivier.mesnard@cea.fr) + * @date Mon oct 5 2015 + * copyright (c) 2006-2015 by CEA + * Project Automaton + * + * @brief transitions that are previously recognized entities + * + ***********************************************************************/ + +#ifndef ENTITYGROUPTRANSITION_H +#define ENTITYGROUPTRANSITION_H + +#include "AutomatonExport.h" +#include "automatonCommon.h" +#include "transitionUnit.h" + +namespace Lima { +namespace LinguisticProcessing { +namespace Automaton { + +class LIMA_AUTOMATON_EXPORT EntityGroupTransition : public TransitionUnit +{ + public: + EntityGroupTransition(); + EntityGroupTransition(Common::MediaticData::EntityGroupId, bool keep=true); + virtual ~EntityGroupTransition(); + + EntityGroupTransition* clone() const; + EntityGroupTransition* create() const; + + std::string printValue() const; + bool operator== (const TransitionUnit&) const; + + bool compare(const LinguisticAnalysisStructure::AnalysisGraph& graph, + const LinguisticGraphVertex& vertex, + AnalysisContent& analysis, + const LinguisticAnalysisStructure::Token* token, + const LinguisticAnalysisStructure::MorphoSyntacticData* data) const; + + TypeTransition type() const; + Common::MediaticData::EntityGroupId entityGroupId() const { return m_entityGroupId; } + void setEntityGroupId(Common::MediaticData::EntityGroupId groupId) { m_entityGroupId=groupId; } + + private: + Common::MediaticData::EntityGroupId m_entityGroupId; + static LimaString m_entityAnnotation; +}; + + +/***********************************************************************/ +// inline access functions +/***********************************************************************/ +inline TypeTransition EntityGroupTransition::type() const { return T_ENTITY_GROUP; } + +inline EntityGroupTransition* EntityGroupTransition::clone() const { + return new 
EntityGroupTransition(*this); } +inline EntityGroupTransition* EntityGroupTransition::create() const { + return new EntityGroupTransition(); } + + +} // namespace end +} // namespace end +} // namespace end + +#endif diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/gazeteerTransition.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/gazeteerTransition.cpp new file mode 100644 index 000000000..1c777c887 --- /dev/null +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/gazeteerTransition.cpp @@ -0,0 +1,376 @@ +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. 
If not, see +*/ +/************************************************************************* +* +* File : gazeteerTransition.cpp +* Author : Olivier Mesnard (olivier.mesnard@cea.fr) +* @date Thu August 04 2015 +* copyright Copyright (C) 2002-2015 by CEA LIST +* Version : $Id$ +* +*************************************************************************/ + + +#include "gazeteerTransition.h" +#include "common/MediaticData/mediaticData.h" +#include +#include // for tie +#include "searchGraph.h" + +using namespace std; +using namespace Lima::LinguisticProcessing::LinguisticAnalysisStructure; +using namespace Lima::Common::MediaticData; + + +namespace Lima { +namespace LinguisticProcessing { +namespace Automaton { + +/***********************************************************************/ +// constructors +/***********************************************************************/ +GazeteerTransition::GazeteerTransition(): +TransitionUnit(), +m_wordSet(), +m_alias() +{ +} + +GazeteerTransition::GazeteerTransition(const std::vector& wordSet, const LimaString& alias, bool keep): +TransitionUnit(keep), +m_wordSet(wordSet.begin(),wordSet.end()), +m_alias(alias) +{ +} + +GazeteerTransition::GazeteerTransition(const GazeteerTransition& t): +TransitionUnit(t), +m_wordSet(t.m_wordSet), +m_alias(t.m_alias) +{ +// TODO ToBeDeleted ? 
+ // copyProperties(t); +} + +GazeteerTransition::~GazeteerTransition() {} + +GazeteerTransition& GazeteerTransition::operator = (const GazeteerTransition& t) { + if (this != &t) { + m_alias = t.alias(); + copyProperties(t); + } + return *this; +} + + +std::string GazeteerTransition::printValue() const { + ostringstream oss; + oss << "alias:" << Lima::Common::Misc::limastring2utf8stdstring(m_alias); + std::set::const_iterator it = m_wordSet.begin(); + if( it != m_wordSet.end() ) { + const Lima::LimaString & word = *it; + oss << "(" << Lima::Common::Misc::limastring2utf8stdstring(word); + } + for( it++ ; it != m_wordSet.end(); it++ ) { + const Lima::LimaString & word = *it; + oss << "," << Lima::Common::Misc::limastring2utf8stdstring(word); + } + oss << ")"; + return oss.str(); +} + +/***********************************************************************/ +// operators == +/***********************************************************************/ +bool GazeteerTransition::operator== (const TransitionUnit& tright) const { + if ( (type() == tright.type()) + && (m_alias == static_cast(tright).alias()) + ) { + return compareProperties(tright); + } + else { + return false; + } +} + +bool GazeteerTransition:: +compare(const LinguisticAnalysisStructure::AnalysisGraph& /*graph*/, + const LinguisticGraphVertex& /*vertex*/, + AnalysisContent& /*analysis*/, + const LinguisticAnalysisStructure::Token* token, + const LinguisticAnalysisStructure::MorphoSyntacticData* /*data*/) const +{ + //AULOGINIT; +// LDEBUG << "GazeteerTransition compare " << Common::MediaticData::MediaticData::changeable().stringsPool()[token->form()] << " and " << Common::MediaticData::MediaticData::changeable().stringsPool()[m_word]; + QString form(token->stringForm()); + std::set::const_iterator it = m_wordSet.lower_bound(form); + if( it == m_wordSet.end() ) { + return false; + } + QString element = *it; + // If element is equal to form + if( element == form ) + { + return true; + } + // Or element begin 
with form followed by a space character + if( element.startsWith(form) ) + { + if( element.at(form.length()) == ' ') + { + return true; + } + } + /* + QString pattern(form); + pattern.append("\\b"); + QRegExp rx(pattern); + int index = qStringList.indexOf(rx); + */ +// return true; + return false; +} + +bool GazeteerTransition:: +matchPath(const LinguisticAnalysisStructure::AnalysisGraph& graph, + const LinguisticGraphVertex& vertex, + const LinguisticGraphVertex& limit, + SearchGraph* searchGraph, + AnalysisContent& analysis, + const LinguisticAnalysisStructure::Token* token, + deque& vertices, + const LinguisticAnalysisStructure::MorphoSyntacticData* /*data*/) const +{ + // TODO: use of limit??? +#ifdef DEBUG_LP + AULOGINIT; +#endif + const LimaString firstSimpleTerm = token->stringForm(); + /* build multi term list in gazeteer with firstSimpleTerm as first term */ + std::vector > additionalMultiTermList; + buildNextTermsList( firstSimpleTerm, additionalMultiTermList ); + /* follow graph if tokens match other terms */ + std::stack,std::vector > > triggerMatches; + checkMultiTerms(graph, vertex, limit, searchGraph, analysis, additionalMultiTermList, triggerMatches ); + if( triggerMatches.empty() ) { +#ifdef DEBUG_LP + LDEBUG << "GazeteerTransition::match: trans of type gazeteerTransition selected but no match"; +#endif + return false; + } + else { + vertices = triggerMatches.top(); + return true; + } + return false; +} + + /* Gazeteer may contains multi-term elements like */ +/* "managing director","Managing Director","managing editor","managing comitee secretary"... 
*/ +/* From wordSet, we build a list of multiple terms, each with parameter firstSimpleTerm as first simple term */ +/* [("managing,director");("managing,Director");("managing,editor");("managing,comitee,secretary")] */ +/* return false if there is no elements begining with "managing" */ +bool GazeteerTransition:: +buildNextTermsList( const LimaString& firstSimpleTerm, std::vector >& multiTermList ) const +{ +#ifdef DEBUG_LP + AULOGINIT; + LDEBUG << "GazeteerTransition::buildNextTermsList(" << firstSimpleTerm << ")"; +#endif + + // Fill list of list of additional simple terms from list of elements + std::set::const_iterator it = m_wordSet.lower_bound(firstSimpleTerm); + if( it == m_wordSet.end() ) { +#ifdef DEBUG_LP + LDEBUG << "GazeteerTransition::buildNextTermsList: Error: first term not found"; +#endif + return false; + } + for( ; it != m_wordSet.end() ; it++ ) + { + LimaString element = *it; +#ifdef DEBUG_LP + LDEBUG << "GazeteerTransition::buildNextTermsList: Examining " << element.toStdString(); +#endif + // if element does not start with firstSimpleTerm, there no more possible match + if( !element.startsWith(firstSimpleTerm) ) { +#ifdef DEBUG_LP + LDEBUG << "GazeteerTransition::buildNextTermsList: stop it!: first term not found"; +#endif + break; + } + std::vector multiTerm; + // if element equals the token, we push a vector with a unique element, and go to the next element + if( element == firstSimpleTerm ) { +#ifdef DEBUG_LP + LDEBUG << "GazeteerTransition::buildNextTermsList: push back in multiTermList singleton " << firstSimpleTerm.toStdString(); +#endif + multiTerm.push_back(firstSimpleTerm); + multiTermList.push_back(multiTerm); + continue; + } + // within element, if firstSimpleTerm is not followed by others simple terms separated with space + // first term is only a prefix and does not match exactly firstSimpleTerm, go to the next element + int pos(0); + int index = element.indexOf(' ', pos); +#ifdef DEBUG_LP + LDEBUG << 
"GazeteerTransition::buildNextTermsList: pos = " << pos << ", index=" << index; +#endif + if( index != firstSimpleTerm.length() ) { +#ifdef DEBUG_LP + LDEBUG << "GazeteerTransition::buildNextTermsList: no second term for " << element.toStdString(); +#endif + continue; + } + else { +#ifdef DEBUG_LP + LDEBUG << "GazeteerTransition::buildNextTermsList: push back in multiterm " << firstSimpleTerm.toStdString(); +#endif + multiTerm.push_back(firstSimpleTerm); + } + // build list of elements following firstSimpleTerm + for( ; ; ) { + pos = index+1; + index = element.indexOf(' ', pos); +#ifdef DEBUG_LP + LDEBUG << "GazeteerTransition::buildNextTermsList: pos = " << pos << ", index=" << index; +#endif + if( index == -1 ) { +#ifdef DEBUG_LP + LDEBUG << "GazeteerTransition::buildNextTermsList: push back last term " << element.mid(pos).toStdString(); +#endif + multiTerm.push_back(element.mid(pos)); + break; + } + else + { +#ifdef DEBUG_LP + LDEBUG << "GazeteerTransition::buildNextTermsList: add term " << element.mid(pos,index-pos).toStdString(); +#endif + multiTerm.push_back(element.mid(pos,index-pos)); + } + } +#ifdef DEBUG_LP + LDEBUG << "GazeteerTransition::buildNextTermsList: push back list of " << multiTerm.size() << " elements"; +#endif + multiTermList.push_back(multiTerm); + } + return( multiTermList.size() > 0 ); +} + +bool GazeteerTransition:: +checkMultiTerms( const AnalysisGraph& graph, + const LinguisticGraphVertex& position, + const LinguisticGraphVertex& limit, + Lima::LinguisticProcessing::Automaton::SearchGraph* searchGraph, + Lima::AnalysisContent& analysis, const vector< vector< Lima::LimaString > >& additionalMultiTermList, + stack< deque< LinguisticGraphVertex >, vector< deque< LinguisticGraphVertex > > >& matches + ) const { + + +#ifdef DEBUG_LP + AULOGINIT; + LDEBUG << "GazeteerTransition::checkMultiTerms( from " << position << ")"; +#endif + // Iteration on multi-terms from gazeteer whose first term matches current token + std::vector >::const_iterator 
multiTermsIt = additionalMultiTermList.begin(); + const LinguisticGraph* lGraph = graph.getGraph(); + for( ; multiTermsIt != additionalMultiTermList.end() ; multiTermsIt++ ) { + // iterator for simpleterms + std::vector::const_iterator termsIt = (*multiTermsIt).begin(); + std::vector::const_iterator termsIt_end = (*multiTermsIt).end(); +#ifdef DEBUG_LP + LDEBUG << "GazeteerTransition::checkMultiTerms: check multi-term (" + << *termsIt << " and " << (*multiTermsIt).size()-1 << " more...)"; +#endif + // For each list of simple Terms, we make a deep first search in the graph + // searchPos stores a stack of position in the graph to perform the deep first search + // the completed path is stored in a deque of vertices (initialized with position) + std::deque triggerMatch; + triggerMatch.push_back(position); + termsIt++; + // init search from position + searchGraph->findNextVertices(lGraph, position); + // init current position + LinguisticGraphVertex nextVertex = position; + // if list is not exhausted + + // case of empty list of simple term + if(termsIt == termsIt_end ) { + // Error! 
+#ifdef DEBUG_LP + LDEBUG << "GazeteerTransition::checkMultiTerms: list of simple terms is a singleton!"; +#endif + matches.push(triggerMatch); + //matches.push(triggerMatch); + } + else { + // go one step ahead from curentPosition if possible + while ( searchGraph->getNextVertex(lGraph, nextVertex )) { + const LinguisticGraphVertex& firstVertex = graph.firstVertex(), + lastVertex = graph.lastVertex(); + if (nextVertex == lastVertex || nextVertex == firstVertex) +// return false; + break; +#ifdef DEBUG_LP + LDEBUG << "GazeteerTransition::checkMultiTerms: progress one step forward, nextVertex=" << nextVertex; + LDEBUG << "GazeteerTransition::checkMultiTerms: test " << *termsIt; +#endif + // test currentVertex + Token* token = get(vertex_token, *lGraph, nextVertex); + LimaString form(token->stringForm()); + if( form == *termsIt ) { +#ifdef DEBUG_LP + LDEBUG << "GazeteerTransition::checkMultiTerms: match with " << *termsIt; +#endif + // If match, push vertex in triggerMatch and initialize next step + // Push out_edge is a better if we have to follow the path from the begining ??? + triggerMatch.push_back(nextVertex); + // stack next step to continue the search + searchGraph->findNextVertices(lGraph, nextVertex); + termsIt++; + if(termsIt == termsIt_end ) { +#ifdef DEBUG_LP + LDEBUG << "GazeteerTransition::checkMultiTerms: list of simple terms exhausted!"; +#endif + // list of Simple term exhausted: success + // we push the path in the aGraph as a solution of triggerMatch + // Only if size of solution is greater than previous one !! 
+ if( matches.empty() || (triggerMatch.size() > matches.top().size()) ) { +#ifdef DEBUG_LP + LDEBUG << "GazeteerTransition::checkMultiTerms: push (in matches) a deque of size " << triggerMatch.size(); +#endif + matches.push(triggerMatch); + } + // no need to go forward + break; + } + // else we do not stack next steps, we obtain a cut + } + } + } + } + if( matches.empty() ) + return false; + return true; +} + +} // namespace end +} // namespace end +} // namespace end diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/gazeteerTransition.h b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/gazeteerTransition.h new file mode 100644 index 000000000..9612c05bf --- /dev/null +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/gazeteerTransition.h @@ -0,0 +1,112 @@ +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. 
If not, see +*/ +/************************************************************************ + * + * @file gazeteerTransition.h + * @author Olivier Mesnard (olivier.mesnard@cea.fr) + * @date Thu August 04 2015 + * copyright Copyright (C) 2002-2015 by CEA LIST + * Project Automaton + * + * @brief transitions that are surface form : belongs to a gazeteer + * + ***********************************************************************/ + +#ifndef GAZETEERTRANSITION_H +#define GAZETEERTRANSITION_H + +#include "AutomatonExport.h" +#include "automatonCommon.h" +#include "transitionUnit.h" +#include +#include "searchGraph.h" + +namespace Lima { +namespace LinguisticProcessing { +namespace Automaton { + +class LIMA_AUTOMATON_EXPORT GazeteerTransition : public TransitionUnit +{ + public: + GazeteerTransition(); + GazeteerTransition(const std::vector& wordSet, const LimaString& alias, bool keep=true); + GazeteerTransition(const GazeteerTransition&); + virtual ~GazeteerTransition(); + GazeteerTransition& operator = (const GazeteerTransition&); + + GazeteerTransition* clone() const; + GazeteerTransition* create() const; + + std::string printValue() const; + bool operator== (const TransitionUnit&) const; + + bool compare(const LinguisticAnalysisStructure::AnalysisGraph& graph, + const LinguisticGraphVertex& vertex, + AnalysisContent& analysis, + const LinguisticAnalysisStructure::Token* token, + const LinguisticAnalysisStructure::MorphoSyntacticData* data) const; + + TypeTransition type() const; + LimaString alias() const; + const std::set& wordSet() const; + + bool matchPath(const LinguisticAnalysisStructure::AnalysisGraph& graph, + const LinguisticGraphVertex& vertex, + const LinguisticGraphVertex& limit, + SearchGraph* searchGraph, + AnalysisContent& analysis, + const LinguisticAnalysisStructure::Token* token, + std::deque& vertices, + const LinguisticAnalysisStructure::MorphoSyntacticData* ) const; + + private: + bool checkMultiTerms( const 
LinguisticAnalysisStructure::AnalysisGraph& graph, + const LinguisticGraphVertex& position, + const LinguisticGraphVertex& limit, + SearchGraph* searchGraph, + AnalysisContent& analysis, + const std::vector >& additionalMultiTermList, + std::stack,std::vector > >& matches + ) const; + + bool buildNextTermsList( const LimaString& firstSimpleTerm, std::vector >& multiTermList ) const; + std::set m_wordSet; + LimaString m_alias; + +}; + + +/***********************************************************************/ +// inline access functions +/***********************************************************************/ +inline const std::set& GazeteerTransition::wordSet() const { return m_wordSet; } +inline TypeTransition GazeteerTransition::type() const { return T_GAZETEER; } + +inline GazeteerTransition* GazeteerTransition::clone() const { + return new GazeteerTransition(*this); } +inline GazeteerTransition* GazeteerTransition::create() const { + return new GazeteerTransition(); } +inline LimaString GazeteerTransition::alias() const { return m_alias; } + + +} // namespace end +} // namespace end +} // namespace end + +#endif diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/recognizer.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/recognizer.cpp index 5d6352f68..58bb4624c 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/recognizer.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/recognizer.cpp @@ -1,1075 +1,1142 @@ -/* - Copyright 2002-2013 CEA LIST - - This file is part of LIMA. - - LIMA is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. 
- - LIMA is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with LIMA. If not, see -*/ -/************************************************************************ -* -* File : recognizer.cpp -* Author : Romaric Besancon (besanconr@zoe.cea.fr) -* Created on : Tue Oct 15 2002 -* Copyright : (c) 2002 by CEA -* -************************************************************************/ - -#include "recognizer.h" - -#include "transitionSearchStructure.h" -#include "automatonCommon.h" -#include "transitionUnit.h" -#include "recognizerData.h" -#include "common/Data/LimaString.h" -#include "common/MediaticData/EntityType.h" -#include "common/MediaticData/mediaticData.h" -#include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" -#include "common/AbstractFactoryPattern/SimpleFactory.h" -#include -#include -#include -#include -#include -#include -#include - -using namespace std; -using namespace Lima::LinguisticProcessing::LinguisticAnalysisStructure; -using namespace Lima::LinguisticProcessing::ApplyRecognizer; - -namespace Lima { -namespace LinguisticProcessing { -namespace Automaton { - -// a comparison operator on Rule pointer: -// to sort SetOfRules on decreasing rule weights -class CompareRulePtr { -public: - bool operator()(Rule* r1,Rule* r2) { - return (r1->getWeight() > r2->getWeight()); - } -}; - - -// a comparison operator on TriggerRule -class Recognizer::CompareTriggerRule { -public: - bool operator()(const Recognizer::TriggerRule* r1, - const Recognizer::TriggerRule* r2) { - return (r1->setOfRules().front()->getWeight() > - r2->setOfRules().front()->getWeight()); - } -}; - - -/** recognizer factory */ -SimpleFactory recognizerFactory(RECOGNIZER_CLASSID); - 
-//********************************************************************** -// constructors -//********************************************************************** -Recognizer::Recognizer(): - AbstractResource(), - m_rules(0), - m_ruleStorage(0), - m_language(), - m_automatonControlParams(), - m_filename(), - m_searchStructure() -{ } - -// copy is complex because of the pointers -Recognizer::Recognizer(const Recognizer& r): -AbstractResource(r) -{ - init(); - copy(r); - - // have to initialize the search structure of the new recognizer - initializeSearchStructure(); -} - -//********************************************************************** -// destructor -//********************************************************************** -Recognizer::~Recognizer() -{ - freeMem(); - clearSearchStructure(); -} - -//********************************************************************** -// copy -//********************************************************************** -Recognizer& Recognizer::operator = (const Recognizer& r) -{ - if (this != &r) - { - freeMem(); - init(); - copy(r); - } - - // do not copy the search structure : recompute it the new recognizer - // (not sure the copy is less complex than recomputing it) - initializeSearchStructure(); - - return (*this); -} - -void Recognizer::init( - Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, - Manager* manager) - -{ - - /** @addtogroup ResourceConfiguration - * - <group name="..." 
class="AutomatonRecognizer"> - * -  rules : file containing the compiled rules of the recognizer - * -  maxDepthStack : maximum size of stack in depth-first-search - * when testing a rule (default is 100) - * -  maxTransitionsExplored : max number of transitions explored - * when testing a rule (default is 1000) - * -  maxNbResults : max number of results temporarily stored - * when testing a rule (default is 50) - * -  maxResultSize : max size of a result for a rule match - * (this parameter can be seen as the effective size of - * "n" when using {0-n} in a rule) (default is 200) - */ - - m_language=manager->getInitializationParameters().language; - string resourcesPath=Common::MediaticData::MediaticData::single().getResourcesPath(); - try - { - string rulesFile = unitConfiguration.getParamsValueAtKey("rules"); - if (rulesFile != "") - { - m_filename=rulesFile; - rulesFile = resourcesPath + "/" + rulesFile; -// LDEBUG << "read recognizer from file : " << rulesFile; - //readFromFile(rulesFile); - AutomatonReader reader; - reader.readRecognizer(rulesFile,*this); - } - } - catch (Common::XMLConfigurationFiles::NoSuchParam& ) { - AULOGINIT; - LERROR << "No param 'rules' in recognizer group for language " << (int)m_language; - throw InvalidConfiguration(); - } - - try - { - string str=unitConfiguration.getParamsValueAtKey("maxDepthStack"); - uint64_t val=atol(str.c_str()); - if (val==0) { - AULOGINIT; - LWARN << "maxDepthStack is 0: keep default value"; - } - else { - m_automatonControlParams.setMaxDepthStack(val); - } - } - catch (Common::XMLConfigurationFiles::NoSuchParam& ) { - // keep default value - } - - try - { - string str=unitConfiguration.getParamsValueAtKey("maxTransitionsExplored"); - uint64_t val=atol(str.c_str()); - if (val==0) { - AULOGINIT; - LWARN << "maxTransitionsExplored is 0: keep default value"; - } - else { - m_automatonControlParams.setMaxTransitionsExplored(val); - } - } - catch (Common::XMLConfigurationFiles::NoSuchParam& ) { - // keep default 
value - } - - try - { - string str=unitConfiguration.getParamsValueAtKey("maxNbResults"); - uint64_t val=atol(str.c_str()); - if (val==0) { - AULOGINIT; - LWARN << "maxNbResults is 0: keep default value"; - } - else { - m_automatonControlParams.setMaxNbResults(val); - } - } - catch (Common::XMLConfigurationFiles::NoSuchParam& ) { - // keep default value - } - - try - { - string str=unitConfiguration.getParamsValueAtKey("maxResultSize"); - uint64_t val=atol(str.c_str()); - if (val==0) { - AULOGINIT; - LWARN << "maxResultSize is 0: keep default value"; - } - else { - m_automatonControlParams.setMaxResultSize(val); - } - } - catch (Common::XMLConfigurationFiles::NoSuchParam& ) { - // keep default value - } - - Common::MediaticData::MediaticData::changeable().stringsPool(m_language).endResourcesPool(); -} - -//********************************************************************** -// helper functions for constructors and destructors -//********************************************************************** -void Recognizer::init() -{ - m_rules.clear(); - m_ruleStorage.clear(); - m_language=UNDEFLANG; - m_automatonControlParams=AutomatonControlParams(); -} - -void Recognizer::copy(const Recognizer& r) -{ - map pointersMap; - - for (uint64_t i(0); iclone(); - m_rules.push_back(TriggerRule(t,SetOfRules(0))); - for (uint64_t j(0); j results; - if (testSetOfRules(*(m_rules[offset].first), - m_rules[offset].second, - graph, - current, - graph.firstVertex(), - graph.lastVertex(), - analysis, - results)) - { - result=results.front(); // only one result because stopAtFirstSuccess=true - return true; - } - return false; -} -*/ - -//********************************************************************** -// test a set of rules for a trigger -uint64_t Recognizer::testSetOfRules(const TransitionUnit& trigger, - const SetOfRules& rules, - const LinguisticAnalysisStructure::AnalysisGraph& graph, - const LinguisticGraphVertex& position, - const LinguisticGraphVertex& begin, - const 
LinguisticGraphVertex& end, - AnalysisContent& analysis, - vector& matches, - std::set* forbiddenTypes, - bool stopAtFirstSuccess, - bool onlyOneSuccessPerType, - bool applySameRuleWhileSuccess) const { - RecognizerMatch leftmatch(&graph); - RecognizerMatch rightmatch(&graph); - - if (onlyOneSuccessPerType && forbiddenTypes==0) { - AULOGINIT; - LERROR << "cannot use onlyOneSuccessPerType " - << "when forbidden types are not allowed"; - onlyOneSuccessPerType=false; - } - - - uint64_t nbSuccess(0); - - // left context is same LinguisticAnalysisStructure::AnalysisGraph as current (current is in fact - // between the current token and the previous one) - LinguisticGraphVertex left=position; - LinguisticGraphVertex right=position; - //LinguisticGraphVertex right=position.forward(); - -#ifdef DEBUG_LP - AULOGINIT; - LDEBUG << "testing set of rules triggered by " << trigger << " on vertex " << position; - LDEBUG << "onlyOneSuccessPerType=" << onlyOneSuccessPerType; - if (logger.isDebugEnabled()) { - std::ostringstream oss; - for (SetOfRules::const_iterator it=rules.begin(),it_end=rules.end();it!=it_end;it++) { - oss << " - " << (*it)->getWeight(); - } - LDEBUG << "Rule weights" << oss.str(); - } -#endif - - bool reapplySameRule(false); - - SetOfRules::const_iterator - rule=rules.begin(), - rule_end=rules.end(); - for (; rule!=rule_end; rule++) { - Rule* currentRule=*rule; - -#ifdef DEBUG_LP - if (logger.isDebugEnabled()) { - LDEBUG << "testing rule "<<*currentRule << "," << currentRule->getRuleId() <<" of type " - << currentRule->getType() << ",reapply=" - << reapplySameRule << " from " << position; - } -#endif - - if (forbiddenTypes && - forbiddenTypes->find(currentRule->getType()) - != forbiddenTypes->end()) { - // type previously forbidden by a negative rule -/* LDEBUG << "type " << currentRule->getType() - << " is forbidden: continue";*/ - continue; - } - - // initializes the constraint checklist - ConstraintCheckList - 
constraintCheckList(currentRule->numberOfConstraints(), - ConstraintCheckListElement(graph)); - - // treat the constraints for the trigger with the constraint - // checklist corresponding to this rule - //Token* token=get(vertex_token,*(graph.getGraph()),position); -// LDEBUG << "Recognizer: checking trigger constraints: "; - - if (!trigger.checkConstraints(graph,position,analysis, - constraintCheckList)) { - // one unary constraint was not verified -// LDEBUG << "one unary constraint on trigger not verified"; - - // apply actions (for actions triggered by failure) - if (!currentRule->negative()) { - currentRule->executeActions(graph, analysis, - constraintCheckList, - false, - 0); // match is not used -// LDEBUG << "actionSuccess=" << actionSuccess; - } - continue; - } - - leftmatch.reinit(); - rightmatch.reinit(); - ForwardSearch forward; - BackwardSearch backward; - bool success = currentRule->test(graph, left, right, - begin, end, analysis, - leftmatch, rightmatch, - constraintCheckList,forward,backward, - m_automatonControlParams); - //LDEBUG << "success=" << success; - - RecognizerMatch* match=0; - - if (success) { - // build complete match - - match=new RecognizerMatch(leftmatch); - match->addBackVertex(position,trigger.keep(), "trigger"); - match->addBack(rightmatch); - // remove elements not kept at begin and end of the expression - match->removeUnkeptAtExtremity(); - - // check if trigger is head - if (trigger.head()) { - match->setHead(position); - } - match->setType(currentRule->getType()); - match->setLinguisticProperties(currentRule->getLinguisticProperties()); - match->setContextual(currentRule->contextual()); - setNormalizedForm(currentRule->getNormalizedForm(),*match); - } - - // execute possible actions associated to the rule iff current rule is - // positive - //LDEBUG << "Recognizer: executing actions: "; - bool actionSuccess = true; - if (!currentRule->negative()) { - // std::cerr << "execute rule " << currentRule->getRuleId() << " of type " - 
// << currentRule->getType() << " on vertex " << position << std::endl; - actionSuccess = currentRule->executeActions(graph, analysis, - constraintCheckList, - success, - match); - //LDEBUG << "actionSuccess=" << actionSuccess; - } - -#ifdef DEBUG_LP - if (logger.isDebugEnabled()) { - LinguisticGraphVertex v=position; - LimaString str(""); - Token* token=get(vertex_token,*(graph.getGraph()),position); - if (token!=0) { - str = token->stringForm(); - } - if (success) { - LDEBUG << "trigger " << v << "[" << str << "]:rule " - << currentRule->getRuleId() << "-> success=" << success - << ",actionSuccess=" << actionSuccess; - LDEBUG << " matched:" << match->getNormalizedString(Common::MediaticData::MediaticData::single().stringsPool(m_language)); - } - else { - LDEBUG << "vertex " << v << "[" << str << "]:rule " - << currentRule->getRuleId() << "-> success= false"; - } - } -#endif - - if (success && actionSuccess) { - if (forbiddenTypes && currentRule->negative()) { - forbiddenTypes->insert(currentRule->getType()); - success = false; - delete match; - match=0; - continue; - } - - RecognizerData* recoData = static_cast(analysis.getData("RecognizerData")); - if (stopAtFirstSuccess||(recoData != 0 && !recoData->getNextVertices().empty())) { - matches.push_back(*match); - delete match; // a copy has been made - match=0; -#ifdef DEBUG_LP - if (logger.isDebugEnabled()) { - LDEBUG << "Returning from testSetOfRules cause stopAtFirstSuccess (" - << stopAtFirstSuccess << ") or next vertices empty (" - << (recoData->getNextVertices().empty()) - << ")"; - } -#endif - return 1; - } - else { - if (applySameRuleWhileSuccess) { - if (reapplySameRule) { - if (*match==matches.back()) { -// AULOGINIT; -// LDEBUG << "Reapplication of same rule gives same result: " -// << "abort to avoid inifinite loop: " -// << *match << ";" << matches.back(); - delete match; // a copy has been made - match=0; - reapplySameRule=false; - continue; - } -/* else { - LDEBUG << "Reapplication of same rule gives 
new result"; - }*/ - } - // reapply same rule - rule--; - reapplySameRule=true; - } - -// LDEBUG << "add match to results " << *match; - matches.push_back(*match); - delete match; // a copy has been made - match=0; - - if (onlyOneSuccessPerType) { -/* LDEBUG << "add " << currentRule->getType() - << " in forbiddenTypes";*/ - forbiddenTypes->insert(currentRule->getType()); - } - nbSuccess++; - } - } - else { -// LDEBUG << "-> no success"; - reapplySameRule=false; - } - - if (match !=0) { - delete match; - } - } - - return nbSuccess; -} - -//********************************************************************** -// normalization function -//********************************************************************** -void Recognizer:: -setNormalizedForm(const LimaString& norm, - RecognizerMatch& match) const -{ - match.features().clear(); - - const FsaStringsPool& sp=Common::MediaticData::MediaticData::single().stringsPool(m_language); - if (norm.isEmpty()) { - // use surface form of the expression as normalized form - match.features().setFeature(DEFAULT_ATTRIBUTE,match.getNormalizedString(sp)); - } - else { - match.features().setFeature(DEFAULT_ATTRIBUTE,norm); - } -} - -//********************************************************************** -// main functions that applies the recognizer on a graph -//********************************************************************** - -// Apply between two nodes and search between the same ones -uint64_t Recognizer:: - apply(const LinguisticAnalysisStructure::AnalysisGraph& graph, - const LinguisticGraphVertex& begin, - const LinguisticGraphVertex& end, - AnalysisContent& analysis, - std::vector& result, - bool testAllVertices, - bool stopAtFirstSuccess, - bool onlyOneSuccessPerType, - bool returnAtFirstSuccess, - bool applySameRuleWhileSuccess) const -{ - return apply(graph, - begin, - end, - begin, - end, - analysis, - result, - testAllVertices, - stopAtFirstSuccess, - onlyOneSuccessPerType, - returnAtFirstSuccess, - 
applySameRuleWhileSuccess); -} - -// Apply between two nodes and search between two others. -// precondition [begin, end] included in [upstreamBound,downstreamBound] -uint64_t Recognizer:: - apply(const LinguisticAnalysisStructure::AnalysisGraph& graph, - const LinguisticGraphVertex& begin, - const LinguisticGraphVertex& end, - const LinguisticGraphVertex& upstreamBound, - const LinguisticGraphVertex& downstreamBound, - AnalysisContent& analysis, - std::vector& result, - bool testAllVertices, - bool stopAtFirstSuccess, - bool onlyOneSuccessPerType, - bool returnAtFirstSuccess, - bool applySameRuleWhileSuccess) const -{ - - if (returnAtFirstSuccess) { - stopAtFirstSuccess=true; // implied by the other - } - -#ifdef DEBUG_LP - AULOGINIT; - LDEBUG << "apply recognizer " << m_filename << " from vertex " - << begin << " to vertex " << end; - LDEBUG << " up bound: " << upstreamBound << "; down bound: " << downstreamBound << "; testAllVertices: " << testAllVertices; - LDEBUG << " stopAtFirstSuccess: " << stopAtFirstSuccess << "; onlyOneSuccessPerType: " << onlyOneSuccessPerType; - LDEBUG << " returnAtFirstSuccess: " << returnAtFirstSuccess << "; applySameRuleWhileSuccess: " << applySameRuleWhileSuccess; -#endif - - uint64_t numberOfRecognized(0); - bool success(false); - - // use deque instead of queue to be able to clear() - std::deque toVisit; - std::set visited; - - toVisit.push_back(begin); - // patch for inifinite loop : avoid begin stopped at first step - //visited.insert(begin); - - bool lastReached = false; - while (!toVisit.empty()) - { - LinguisticGraphVertex currentVertex=toVisit.front(); - toVisit.pop_front(); - // patch for inifinite loop : check if we already seen this node - if (visited.find(currentVertex) != visited.end()) - { - continue; - } - - visited.insert(currentVertex); -#ifdef DEBUG_LP - LDEBUG << "to visit size=" << toVisit.size() << " ; currentVertex=" << currentVertex; -#endif - - if (lastReached || // limit given by argument - currentVertex == 
graph.lastVertex()) { // end of the graph - // LDEBUG << "vertex " << currentVertex << " is last vertex"; - continue; // may be other nodes to test in queue - } - if (currentVertex == end ) { // limit given by argument - lastReached = true; - } - - if (currentVertex != graph.firstVertex()) { -#ifdef DEBUG_LP - LDEBUG << "Recognizer: test on vertex " << currentVertex; -#endif - success = testOnVertex(graph,currentVertex, - upstreamBound,downstreamBound, - analysis,result, - stopAtFirstSuccess, - onlyOneSuccessPerType, - applySameRuleWhileSuccess); - if (success) { - numberOfRecognized++; - if (returnAtFirstSuccess) - return numberOfRecognized; - if (! testAllVertices) { // restart from end of recognized expression -#ifdef DEBUG_LP - LDEBUG << "success: continue from vertex " << currentVertex; -#endif - // GC on 20110803: the clearing below was problematic in case of rules like that: - // []:(t_capital_1st|t_capital){1-3} [,]::LOCATION:N_LOCATION - // which matches text before (left) the trigger which is not included in the match. - // thus the next vertex explored was the newly created one ; the vertex following - // it is already visited (this is in this case the comma) and the content of - // toVisit (the vertex after the trigger) was removed. Thus the search stopped after - // the new vertex. - // Warning: what is the inpact on the use of the testAllVertices parameter ? And is there - // any other side effect ? 
-// toVisit.clear(); - - } - } - } - - // store following nodes to test - LinguisticGraphOutEdgeIt outEdge,outEdge_end; - boost::tie (outEdge,outEdge_end)=out_edges(currentVertex,*(graph.getGraph())); - - for (; outEdge!=outEdge_end; outEdge++) { - LinguisticGraphVertex next=target(*outEdge,*(graph.getGraph())); - if (visited.find(next)==visited.end()) { -#ifdef DEBUG_LP - LDEBUG << "Recognizer: adding out edge target vertex to the 'to visit' list: " << next; -#endif - toVisit.push_back(next); - // do not put in visited unless it is really visited - // (otherwise, may be suppressed when testAllVertices is false - // and never visited) - //visited.insert(next); - } - else { -#ifdef DEBUG_LP - LDEBUG << "Recognizer: already visited:" << next; -#endif - } - } - RecognizerData* recoData=static_cast(analysis.getData("RecognizerData")); - std::set& nextVertices = recoData->getNextVertices(); - if (recoData != 0 && !nextVertices.empty() ) - { -#ifdef DEBUG_LP - LDEBUG << "Recognizer: adding next vertices to the 'to visit' list"; -#endif - std::set< LinguisticGraphVertex >::const_iterator nvit, nvit_end; - nvit = nextVertices.begin(); - nvit_end = nextVertices.end(); - for (; nvit != nvit_end; nvit++) - { -#ifdef DEBUG_LP - LDEBUG << " - " << *nvit; -#endif - toVisit.push_front(*nvit); - } - nextVertices.clear(); - } -#ifdef DEBUG_LP - LDEBUG << "Recognizer: 'to visit' list size is now: " << toVisit.size(); -#endif - } - return numberOfRecognized; -} - - -//********************************************************************** -// test the recognizer on a vertex : test -//********************************************************************** -uint64_t Recognizer:: -testOnVertex(const LinguisticAnalysisStructure::AnalysisGraph& graph, - LinguisticGraphVertex& current, - const LinguisticGraphVertex& begin, - const LinguisticGraphVertex& end, - AnalysisContent& analysis, - std::vector& result, - bool stopAtFirstSuccess, - bool onlyOneSuccessPerType, - bool 
applySameRuleWhileSuccess) const -{ - //AULOGINIT; - Token* token = get(vertex_token, *(graph.getGraph()), current); - MorphoSyntacticData* data = get(vertex_data, *(graph.getGraph()), current); - - if (token==0) { - AULOGINIT; - LERROR << "no token for vertex " << current; - return 0; - } - - if (data==0) { - AULOGINIT; - LERROR << "no data for vertex " << current; - return 0; - } - - vector matchingRules; - set forbiddenTypes; - uint64_t nbSuccess=0; - - findNextSetOfRules(graph, current, analysis, token, data, matchingRules); - - if (! matchingRules.empty()) { - std::vector::const_iterator - ruleSet=matchingRules.begin(), - ruleSet_end=matchingRules.end(); - for (; ruleSet!=ruleSet_end; ruleSet++) { - uint64_t nbSuccessForTheseRules= - testSetOfRules(*((*ruleSet)->transitionUnit()), - (*ruleSet)->setOfRules(), - graph, current, begin, end,analysis, - result, &forbiddenTypes, - stopAtFirstSuccess, - onlyOneSuccessPerType, - applySameRuleWhileSuccess); - if (nbSuccessForTheseRules>0) { - nbSuccess+=nbSuccessForTheseRules; - // skip recognized part (if the end of the recognized part is after - // current token) - RecognizerMatch& lastSuccess=result.back(); - Token* t=get(vertex_token,*(graph.getGraph()),current); - uint64_t currentTokenEnd=t->position()+t->length(); - RecognizerData* recoData = static_cast(analysis.getData("RecognizerData")); - if (stopAtFirstSuccess||(recoData != 0 && !recoData->getNextVertices().empty())) { - if (lastSuccess.positionEnd() >= currentTokenEnd) { - current=lastSuccess.getEnd(); - } - break; - } - } - } - for(std::vector::iterator it=matchingRules.begin(), - it_end=matchingRules.end(); it!=it_end; it++) { - if (*it!=0) { - delete (*it); - } - } - } - forbiddenTypes.clear(); - - // LDEBUG << "testOnVertex nb successes: " << nbSuccess; - return nbSuccess; -} - -//********************************************************************** -//resolve the problem of overlapping entities in the list of entities : -// when two entities are 
overlaping, only one is kept -//********************************************************************** -uint64_t Recognizer:: -resolveOverlappingEntities(std::vector& listEntities, - const OverlapResolutionStrategy& strategy) const -{ - typedef std::vector::iterator vectorRecognizerMatchIterator; - - uint64_t numberOfOverlappingEntities(0); - - if (listEntities.empty()) { - return numberOfOverlappingEntities; - } - - switch (strategy) { - case IGNORE_FIRST: { - vectorRecognizerMatchIterator currentEntity(listEntities.begin()); - vectorRecognizerMatchIterator nextEntity(currentEntity); - nextEntity++; - while (nextEntity != listEntities.end()) { - if (currentEntity->isOverlapping(*nextEntity)) { - numberOfOverlappingEntities++; - currentEntity=listEntities.erase(currentEntity); - nextEntity=currentEntity; - nextEntity++; - } - else { - currentEntity++; - nextEntity++; - } - } - break; - } - case IGNORE_SECOND: { - vectorRecognizerMatchIterator currentEntity(listEntities.begin()); - vectorRecognizerMatchIterator previousEntity(currentEntity); - currentEntity++; - while (currentEntity != listEntities.end()) { - if (currentEntity->isOverlapping(*previousEntity)) { - numberOfOverlappingEntities++; - currentEntity=listEntities.erase(currentEntity); - } - else { - previousEntity++; - currentEntity++; - } - } - break; - } - case IGNORE_SMALLEST: { - vectorRecognizerMatchIterator currentEntity(listEntities.begin()); - vectorRecognizerMatchIterator previousEntity(currentEntity); - currentEntity++; - while (currentEntity != listEntities.end()) { - if (currentEntity->isOverlapping(*previousEntity)) { - numberOfOverlappingEntities++; - if (currentEntity->numberOfElements() - < previousEntity->numberOfElements()) { // keep previous entity - currentEntity=listEntities.erase(currentEntity); - } - else { // keep current entity - previousEntity=listEntities.erase(previousEntity); - currentEntity=previousEntity; - currentEntity++; - } - } - else { - previousEntity++; - 
currentEntity++; - } - } - break; - } - default: - break; - } - - return numberOfOverlappingEntities; -} - -//********************************************************************** -// find the set of rules in the recognizer that accept -// a particular token as trigger -//********************************************************************** -void Recognizer:: -findNextSetOfRules(const LinguisticAnalysisStructure::AnalysisGraph& graph, - LinguisticGraphVertex& vertex, - AnalysisContent& analysis, - const LinguisticAnalysisStructure::Token* token, - const LinguisticAnalysisStructure::MorphoSyntacticData* data, - std::vector& matchingSetOfRules) const -{ - matchingSetOfRules.clear(); - - // find matching rules - std::vector matchingRules; - m_searchStructure.findMatchingTransitions(graph,vertex,analysis,token,data,matchingRules); - - // matching rules are gathered by common trigger (transition unit) - // we have to re-sort the rules by their weight at a global level, independently of the trigger - // create a vector of TriggerRule where each contains only one rule, then sort it - for (std::vector::const_iterator it=matchingRules.begin(),it_end=matchingRules.end();it!=it_end;it++) { - for (SetOfRules::const_iterator r=(*it)->setOfRules().begin(),r_end=(*it)->setOfRules().end(); r!=r_end;r++) { - matchingSetOfRules.push_back(new TriggerRule((*it)->transitionUnit(),SetOfRules(1,*r))); - } - } - sort(matchingSetOfRules.begin(),matchingSetOfRules.end(),CompareTriggerRule()); - - // then, gather rules with the same trigger that are consecutive in this new list - // (may save some constraint checking on trigger) - if (! 
matchingSetOfRules.empty()) { - std::vector::iterator it=matchingSetOfRules.begin(); - TransitionUnit* currentTrigger=(*it)->transitionUnit(); - std::vector::iterator next=it; - next++; - while (next!=matchingSetOfRules.end()) { - if ((*next)->transitionUnit() == currentTrigger) { - (*it)->second.push_back((*next)->setOfRules().front()); - delete *next; - next=matchingSetOfRules.erase(next); - } - else { - it++; - currentTrigger=(*it)->transitionUnit(); - next++; - } - } - } -} - -void Recognizer::initializeSearchStructure() { - const Common::PropertyCode::PropertyAccessor* macro=&(static_cast(Common::MediaticData::MediaticData::single().mediaData(m_language)).getPropertyCodeManager().getPropertyAccessor("MACRO")); - const Common::PropertyCode::PropertyAccessor* micro=&(static_cast(Common::MediaticData::MediaticData::single().mediaData(m_language)).getPropertyCodeManager().getPropertyAccessor("MICRO")); - m_searchStructure.init(m_rules,macro,micro); -} - -void Recognizer::clearSearchStructure() { - m_searchStructure.clear(); -} - -//********************************************************************** -// adding a rule -//********************************************************************** -uint64_t Recognizer::addRuleInStorage(Rule* rule) -{ - // add the rule in the storage - m_ruleStorage.push_back(rule); - // return the index of the rule in the storage - return (m_ruleStorage.size() - 1); -} - -uint64_t Recognizer::addRule(TransitionUnit* trigger, Rule* rule) -{ - uint64_t indexRule=addRuleInStorage(rule); - - // find if the trigger already exists in the set of triggers - for (uint64_t i(0); iclone(), - SetOfRules(1,rule))); - - return indexRule; -} - -void Recognizer::addRule(TransitionUnit* trigger, - const uint64_t index) -{ - // find if the trigger already exists in the set of triggers - for (uint64_t i(0); iclone(), - SetOfRules(1,m_ruleStorage[index]))); -} - -//********************************************************************** -// input/output in a 
binary format -//********************************************************************** -// void Recognizer::readFromTextFile(std::string filename) { -// RecognizerCompiler::buildRecognizer(*this,filename); -// } - -// simple linear search (called only with write function -> not optimized) -uint64_t Recognizer::findRuleIndex(Rule* r) const -{ - for (uint64_t i(0); i" << m_rules[i].first->printValue() << "" - << "" << i << "" << endl; - } -} - -//*************************************************************************** -// output -//*************************************************************************** -ostream& operator << (ostream& os, const Recognizer& r) -{ - for (uint64_t i(0); i +*/ +/************************************************************************ +* +* File : recognizer.cpp +* Author : Romaric Besancon (besanconr@zoe.cea.fr) +* Created on : Tue Oct 15 2002 +* Copyright : (c) 2002 by CEA +* +************************************************************************/ + +#include "recognizer.h" + +#include "transitionSearchStructure.h" +#include "automatonCommon.h" +#include "transitionUnit.h" +#include "recognizerData.h" +#include "common/tools/FileUtils.h" +#include "common/Data/LimaString.h" +#include "common/MediaticData/EntityType.h" +#include "common/MediaticData/mediaticData.h" +#include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" +#include "common/AbstractFactoryPattern/SimpleFactory.h" +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; +using namespace Lima::LinguisticProcessing::LinguisticAnalysisStructure; +using namespace Lima::LinguisticProcessing::ApplyRecognizer; + +namespace Lima { +namespace LinguisticProcessing { +namespace Automaton { + +// a comparison operator on Rule pointer: +// to sort SetOfRules on decreasing rule weights +class CompareRulePtr { +public: + bool operator()(Rule* r1,Rule* r2) { + return (r1->getWeight() > r2->getWeight()); + } +}; + + 
+// a comparison operator on TriggerRule +class Recognizer::CompareTriggerRule { +public: + bool operator()(const Recognizer::TriggerRule* r1, + const Recognizer::TriggerRule* r2) { + return (r1->setOfRules().front()->getWeight() > + r2->setOfRules().front()->getWeight()); + } +}; + + +/** recognizer factory */ +SimpleFactory recognizerFactory(RECOGNIZER_CLASSID); + +//********************************************************************** +// constructors +//********************************************************************** +Recognizer::Recognizer(): + AbstractResource(), + m_rules(0), + m_ruleStorage(0), + m_language(), + m_automatonControlParams(), + m_filename(), + m_searchStructure() +{ } + +// copy is complex because of the pointers +Recognizer::Recognizer(const Recognizer& r): +AbstractResource(r) +{ + init(); + copy(r); + + // have to initialize the search structure of the new recognizer + initializeSearchStructure(); +} + +//********************************************************************** +// destructor +//********************************************************************** +Recognizer::~Recognizer() +{ + freeMem(); + clearSearchStructure(); +} + +//********************************************************************** +// copy +//********************************************************************** +Recognizer& Recognizer::operator = (const Recognizer& r) +{ + if (this != &r) + { + freeMem(); + init(); + copy(r); + } + + // do not copy the search structure : recompute it the new recognizer + // (not sure the copy is less complex than recomputing it) + initializeSearchStructure(); + + return (*this); +} + +void Recognizer::init( + Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, + Manager* manager) + +{ + + /** @addtogroup ResourceConfiguration + * - <group name="..." 
class="AutomatonRecognizer"> + * -  rules : file containing the compiled rules of the recognizer + * -  maxDepthStack : maximum size of stack in depth-first-search + * when testing a rule (default is 100) + * -  maxTransitionsExplored : max number of transitions explored + * when testing a rule (default is 1000) + * -  maxNbResults : max number of results temporarily stored + * when testing a rule (default is 50) + * -  maxResultSize : max size of a result for a rule match + * (this parameter can be seen as the effective size of + * "n" when using {0-n} in a rule) (default is 200) + */ + + m_language=manager->getInitializationParameters().language; + string resourcesPath=Common::MediaticData::MediaticData::single().getResourcesPath(); + try + { + QString rulesFile = unitConfiguration.getParamsValueAtKey("rules").c_str(); + if (!rulesFile.isEmpty()) + { + m_filename=rulesFile.toUtf8().constData(); + rulesFile = Common::Misc::findFileInPaths(resourcesPath.c_str(), rulesFile); +// LDEBUG << "read recognizer from file : " << rulesFile; + //readFromFile(rulesFile); + AutomatonReader reader; + reader.readRecognizer(rulesFile.toUtf8().constData(),*this); + } + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + AULOGINIT; + LERROR << "No param 'rules' in recognizer group for language " << (int)m_language; + throw InvalidConfiguration(); + } + + try + { + string str=unitConfiguration.getParamsValueAtKey("maxDepthStack"); + uint64_t val=atol(str.c_str()); + if (val==0) { + AULOGINIT; + LWARN << "maxDepthStack is 0: keep default value"; + } + else { + m_automatonControlParams.setMaxDepthStack(val); + } + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + // keep default value + } + + try + { + string str=unitConfiguration.getParamsValueAtKey("maxTransitionsExplored"); + uint64_t val=atol(str.c_str()); + if (val==0) { + AULOGINIT; + LWARN << "maxTransitionsExplored is 0: keep default value"; + } + else { + 
m_automatonControlParams.setMaxTransitionsExplored(val); + } + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + // keep default value + } + + try + { + string str=unitConfiguration.getParamsValueAtKey("maxNbResults"); + uint64_t val=atol(str.c_str()); + if (val==0) { + AULOGINIT; + LWARN << "maxNbResults is 0: keep default value"; + } + else { + m_automatonControlParams.setMaxNbResults(val); + } + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + // keep default value + } + + try + { + string str=unitConfiguration.getParamsValueAtKey("maxResultSize"); + uint64_t val=atol(str.c_str()); + if (val==0) { + AULOGINIT; + LWARN << "maxResultSize is 0: keep default value"; + } + else { + m_automatonControlParams.setMaxResultSize(val); + } + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + // keep default value + } + + Common::MediaticData::MediaticData::changeable().stringsPool(m_language).endResourcesPool(); +} + +//********************************************************************** +// helper functions for constructors and destructors +//********************************************************************** +void Recognizer::init() +{ + m_rules.clear(); + m_ruleStorage.clear(); + m_language=UNDEFLANG; + m_automatonControlParams=AutomatonControlParams(); +} + +void Recognizer::copy(const Recognizer& r) +{ + map pointersMap; + + for (uint64_t i(0); iclone(); + m_rules.push_back(TriggerRule(t,SetOfRules(0))); + for (uint64_t j(0); j results; + if (testSetOfRules(*(m_rules[offset].first), + m_rules[offset].second, + graph, + current, + graph.firstVertex(), + graph.lastVertex(), + analysis, + results)) + { + result=results.front(); // only one result because stopAtFirstSuccess=true + return true; + } + return false; +} +*/ + +//********************************************************************** +// test a set of rules for a trigger +uint64_t Recognizer::testSetOfRules(const TransitionUnit& trigger, + const SetOfRules& rules, + const 
LinguisticAnalysisStructure::AnalysisGraph& graph, + const LinguisticGraphVertex& position, + const LinguisticGraphVertex& begin, + const LinguisticGraphVertex& end, + AnalysisContent& analysis, + vector& matches, + std::set* forbiddenTypes, + bool stopAtFirstSuccess, + bool onlyOneSuccessPerType, + bool applySameRuleWhileSuccess) const { + AULOGINIT; + // If the trigger is defined with a gazeteer, we must check the case of multi-term elements in the gazeteer + const GazeteerTransition* gazeteerTrigger = dynamic_cast(&trigger); + RecognizerMatch triggermatch(&graph); + LinguisticGraphVertex right=position; + if( gazeteerTrigger != 0 ) { + Token* token = get(vertex_token, *(graph.getGraph()), position); + MorphoSyntacticData* data = get(vertex_data, *(graph.getGraph()), position); + deque vertices; + ForwardSearch searchGraph; + bool match = gazeteerTrigger->matchPath(graph, position, end, &searchGraph, analysis, token, vertices, data); + if( match ) { + for( std::deque::const_iterator vIt = vertices.begin(); vIt != vertices.end() ; vIt++ ) { + triggermatch.addBackVertex(*vIt,trigger.keep(),"trigger"); + } + } + } + else { + triggermatch.addBackVertex(position,trigger.keep(),"trigger"); + right=position; + } + + RecognizerMatch leftmatch(&graph); + RecognizerMatch rightmatch(&graph); + + if (onlyOneSuccessPerType && forbiddenTypes==0) { + LERROR << "Recognizer::testSetOfRules: cannot use onlyOneSuccessPerType " + << "when forbidden types are not allowed"; + onlyOneSuccessPerType=false; + } + + + uint64_t nbSuccess(0); + + // left context is same LinguisticAnalysisStructure::AnalysisGraph as current (current is in fact + // between the current token and the previous one) + LinguisticGraphVertex left=position; + +#ifdef DEBUG_LP + LDEBUG << "Recognizer::testSetOfRules: testing set of rules triggered by " << trigger << " on vertex " << position; + LDEBUG << "onlyOneSuccessPerType=" << onlyOneSuccessPerType; + if (logger.isDebugEnabled()) { + std::ostringstream oss; + 
for (SetOfRules::const_iterator it=rules.begin(),it_end=rules.end();it!=it_end;it++) { + oss << " - " << (*it)->getWeight(); + } + LDEBUG << "Rule weights" << oss.str(); + } +#endif + + bool reapplySameRule(false); + + SetOfRules::const_iterator + #ifdef ANTINNO_BUGFIX + // FWI 19/12/2013 : ajout définition de "rule_begin" + rule_begin=rules.begin(), +#endif + rule=rules.begin(), + rule_end=rules.end(); + for (; rule!=rule_end; rule++) { + Rule* currentRule=*rule; + +#ifdef DEBUG_LP + if (logger.isDebugEnabled()) { + LDEBUG << "Recognizer::testSetOfRules: testing rule "<<*currentRule << "," << currentRule->getRuleId() <<" of type " + << currentRule->getType() << ",reapply=" + << reapplySameRule << " from " << position; + } +#endif + + if (forbiddenTypes && + forbiddenTypes->find(currentRule->getType()) + != forbiddenTypes->end()) { + // type previously forbidden by a negative rule +/* LDEBUG << "type " << currentRule->getType() + << " is forbidden: continue";*/ + continue; + } + + // initializes the constraint checklist + ConstraintCheckList + constraintCheckList(currentRule->numberOfConstraints(), + ConstraintCheckListElement(graph)); + + // treat the constraints for the trigger with the constraint + // checklist corresponding to this rule + //Token* token=get(vertex_token,*(graph.getGraph()),position); +// LDEBUG << "Recognizer: checking trigger constraints: "; + + if (!trigger.checkConstraints(graph,position,analysis, + constraintCheckList)) { + // one unary constraint was not verified +// LDEBUG << "one unary constraint on trigger not verified"; + + // apply actions (for actions triggered by failure) + if (!currentRule->negative()) { + currentRule->executeActions(graph, analysis, + constraintCheckList, + false, + 0); // match is not used +// LDEBUG << "actionSuccess=" << actionSuccess; + } + continue; + } + + leftmatch.reinit(); + rightmatch.reinit(); + ForwardSearch forward; + BackwardSearch backward; + bool success = currentRule->test(graph, left, right, + 
begin, end, analysis, + leftmatch, rightmatch, + constraintCheckList,forward,backward, + m_automatonControlParams); + //LDEBUG << "success=" << success; + + RecognizerMatch* match=0; + + if (success) { + // build complete match + + match=new RecognizerMatch(leftmatch); + if (leftmatch.getHead() != 0) { + match->setHead(leftmatch.getHead()); + } + + // TODO: add node of gazeteerTrigger + //match->addBackVertex(position,trigger.keep(), "trigger"); + /* + RecognizerMatch::const_iterator triggerMatchIt = triggermatch.begin(); + for( ; triggerMatchIt != triggermatch.end(); triggerMatchIt++) { + match->addBackVertex(*triggerMatchIt,trigger.keep(), "trigger"); + } + */ + match->addBack(triggermatch); + match->addBack(rightmatch); + // remove elements not kept at begin and end of the expression + match->removeUnkeptAtExtremity(); + + // check if trigger is head + match->setType(currentRule->getType()); + match->setLinguisticProperties(currentRule->getLinguisticProperties()); + match->setContextual(currentRule->contextual()); + setNormalizedForm(currentRule->getNormalizedForm(),*match); + } + + // execute possible actions associated to the rule iff current rule is + // positive + //LDEBUG << "Recognizer: executing actions: "; + bool actionSuccess = true; + if (!currentRule->negative()) { + actionSuccess = currentRule->executeActions(graph, analysis, + constraintCheckList, + success, + match); + //LDEBUG << "actionSuccess=" << actionSuccess; + } + +#ifdef DEBUG_LP + if (logger.isDebugEnabled()) { + LinguisticGraphVertex v=position; + LimaString str(""); + Token* token=get(vertex_token,*(graph.getGraph()),position); + if (token!=0) { + str = token->stringForm(); + } + if (success) { + LDEBUG << "Recognizer::testSetOfRules: trigger " << v << "[" << str << "]:rule " + << currentRule->getRuleId() << "-> success=" << success + << ",actionSuccess=" << actionSuccess; + LDEBUG << " matched:" << 
match->getNormalizedString(Common::MediaticData::MediaticData::single().stringsPool(m_language)); + } + else { + LDEBUG << "Recognizer::testSetOfRules: vertex " << v << "[" << str << "]:rule " + << currentRule->getRuleId() << "-> success= false"; + } + } +#endif + + if (success && actionSuccess) { + if (forbiddenTypes && currentRule->negative()) { + forbiddenTypes->insert(currentRule->getType()); + success = false; + delete match; + match=0; + continue; + } + LINFO << "Recognizer::testSetOfRules: execute rule " << currentRule->getRuleId() + << " of type "<< currentRule->getType() + << "(" << Lima::Common::MediaticData::MediaticData::single().getEntityName(currentRule->getType()) + << ") on vertex " << position; + RecognizerData* recoData = static_cast(analysis.getData("RecognizerData")); + if (stopAtFirstSuccess||(recoData != 0 && !recoData->getNextVertices().empty())) { + matches.push_back(*match); + delete match; // a copy has been made + match=0; +#ifdef DEBUG_LP + if (logger.isDebugEnabled()) { + LDEBUG << "Recognizer::testSetOfRules: Returning from testSetOfRules cause stopAtFirstSuccess (" + << stopAtFirstSuccess << ") or next vertices empty (" + << (recoData->getNextVertices().empty()) + << ")"; + } +#endif + return 1; + } + else { + if (applySameRuleWhileSuccess) { + if (reapplySameRule) { + if (*match==matches.back()) { +// AULOGINIT; +// LDEBUG << "Reapplication of same rule gives same result: " +// << "abort to avoid inifinite loop: " +// << *match << ";" << matches.back(); + delete match; // a copy has been made + match=0; + reapplySameRule=false; + continue; + } +/* else { + LDEBUG << "Reapplication of same rule gives new result"; + }*/ + } + // reapply same rule + #ifdef ANTINNO_BUGFIX + // FWI 19/12/2013 : ajout test pour ne faire le -- que si nécessaire + if (rule != rule_begin) +#endif + rule--; + + reapplySameRule=true; + } + +// LDEBUG << "add match to results " << *match; + matches.push_back(*match); + delete match; // a copy has been made + 
match=0; + + if (onlyOneSuccessPerType) { +/* LDEBUG << "add " << currentRule->getType() + << " in forbiddenTypes";*/ + forbiddenTypes->insert(currentRule->getType()); + } + nbSuccess++; + } + } + else { +// LDEBUG << "-> no success"; + reapplySameRule=false; + } + + if (match !=0) { + delete match; + } + } + + return nbSuccess; +} + +//********************************************************************** +// normalization function +//********************************************************************** +void Recognizer:: +setNormalizedForm(const LimaString& norm, + RecognizerMatch& match) const +{ +#ifdef ANTINNO_SPECIFIC +#ifdef DEBUG_LP + AULOGINIT +#endif +#endif + + match.features().clear(); + + const FsaStringsPool& sp=Common::MediaticData::MediaticData::single().stringsPool(m_language); + if (norm.isEmpty()) { +#ifdef ANTINNO_SPECIFIC +#ifdef DEBUG_LP + LDEBUG << "Recognizer::setNormalizedForm(norm=""): match.getNormalizedString(sp)= " << match.getNormalizedString(sp); +#endif +#endif + // use surface form of the expression as normalized form + match.features().setFeature(DEFAULT_ATTRIBUTE,match.getNormalizedString(sp)); + } + else { +#ifdef ANTINNO_SPECIFIC +#ifdef DEBUG_LP + LDEBUG << "Recognizer::setNormalizedForm(norm): norm= " << norm; +#endif +#endif + match.features().setFeature(DEFAULT_ATTRIBUTE,norm); + } +} + +//********************************************************************** +// main functions that applies the recognizer on a graph +//********************************************************************** + +// Apply between two nodes and search between the same ones +uint64_t Recognizer:: + apply(const LinguisticAnalysisStructure::AnalysisGraph& graph, + const LinguisticGraphVertex& begin, + const LinguisticGraphVertex& end, + AnalysisContent& analysis, + std::vector& result, + bool testAllVertices, + bool stopAtFirstSuccess, + bool onlyOneSuccessPerType, + bool returnAtFirstSuccess, + bool applySameRuleWhileSuccess) const +{ + return 
apply(graph, + begin, + end, + begin, + end, + analysis, + result, + testAllVertices, + stopAtFirstSuccess, + onlyOneSuccessPerType, + returnAtFirstSuccess, + applySameRuleWhileSuccess); +} + +// Apply between two nodes and search between two others. +// precondition [begin, end] included in [upstreamBound,downstreamBound] +uint64_t Recognizer:: + apply(const LinguisticAnalysisStructure::AnalysisGraph& graph, + const LinguisticGraphVertex& begin, + const LinguisticGraphVertex& end, + const LinguisticGraphVertex& upstreamBound, + const LinguisticGraphVertex& downstreamBound, + AnalysisContent& analysis, + std::vector& result, + bool testAllVertices, + bool stopAtFirstSuccess, + bool onlyOneSuccessPerType, + bool returnAtFirstSuccess, + bool applySameRuleWhileSuccess) const +{ +#ifdef ANTINNO_SPECIFIC + auto const& stopAnalyze = analysis.stopAnalyze(); +#endif + if (returnAtFirstSuccess) { + stopAtFirstSuccess=true; // implied by the other + } + +#ifdef DEBUG_LP + AULOGINIT; + LDEBUG << "apply recognizer " << m_filename << " from vertex " + << begin << " to vertex " << end; + LDEBUG << " up bound: " << upstreamBound << "; down bound: " << downstreamBound << "; testAllVertices: " << testAllVertices; + LDEBUG << " stopAtFirstSuccess: " << stopAtFirstSuccess << "; onlyOneSuccessPerType: " << onlyOneSuccessPerType; + LDEBUG << " returnAtFirstSuccess: " << returnAtFirstSuccess << "; applySameRuleWhileSuccess: " << applySameRuleWhileSuccess; +#endif + + uint64_t numberOfRecognized(0); + bool success(false); + + // use deque instead of queue to be able to clear() + std::deque toVisit; + std::set visited; + + toVisit.push_back(begin); + // patch for inifinite loop : avoid begin stopped at first step + //visited.insert(begin); + + bool lastReached = false; + while (!toVisit.empty()) + { + LinguisticGraphVertex currentVertex=toVisit.front(); + toVisit.pop_front(); + // patch for inifinite loop : check if we already seen this node + if (visited.find(currentVertex) != 
visited.end()) + { + continue; + } + + visited.insert(currentVertex); +#ifdef DEBUG_LP + LDEBUG << "to visit size=" << toVisit.size() << " ; currentVertex=" << currentVertex; +#endif + + if (lastReached || // limit given by argument + currentVertex == graph.lastVertex()) { // end of the graph + // LDEBUG << "vertex " << currentVertex << " is last vertex"; + continue; // may be other nodes to test in queue + } + if (currentVertex == end ) { // limit given by argument + lastReached = true; + } + + if (currentVertex != graph.firstVertex()) { +#ifdef DEBUG_LP + LDEBUG << "Recognizer: test on vertex " << currentVertex; +#endif +#ifdef ANTINNO_SPECIFIC + if (stopAnalyze) + { +#if !defined DEBUG_LP + AULOGINIT; +#endif + LERROR << "Stopped in Recognizer"; + return 0; + } +#endif + success = testOnVertex(graph,currentVertex, + upstreamBound,downstreamBound, + analysis,result, + stopAtFirstSuccess, + onlyOneSuccessPerType, + applySameRuleWhileSuccess); + if (success) { + numberOfRecognized++; + if (returnAtFirstSuccess) + return numberOfRecognized; + if (! testAllVertices) { // restart from end of recognized expression +#ifdef DEBUG_LP + LDEBUG << "success: continue from vertex " << currentVertex; +#endif + // GC on 20110803: the clearing below was problematic in case of rules like that: + // []:(t_capital_1st|t_capital){1-3} [,]::LOCATION:N_LOCATION + // which matches text before (left) the trigger which is not included in the match. + // thus the next vertex explored was the newly created one ; the vertex following + // it is already visited (this is in this case the comma) and the content of + // toVisit (the vertex after the trigger) was removed. Thus the search stopped after + // the new vertex. + // Warning: what is the inpact on the use of the testAllVertices parameter ? And is there + // any other side effect ? 
+// toVisit.clear(); + + } + } + } + + // store following nodes to test + LinguisticGraphOutEdgeIt outEdge,outEdge_end; + boost::tie (outEdge,outEdge_end)=out_edges(currentVertex,*(graph.getGraph())); + + for (; outEdge!=outEdge_end; outEdge++) { + LinguisticGraphVertex next=target(*outEdge,*(graph.getGraph())); + if (visited.find(next)==visited.end()) { +#ifdef DEBUG_LP + LDEBUG << "Recognizer: adding out edge target vertex to the 'to visit' list: " << next; +#endif + toVisit.push_back(next); + // do not put in visited unless it is really visited + // (otherwise, may be suppressed when testAllVertices is false + // and never visited) + //visited.insert(next); + } + else { +#ifdef DEBUG_LP + LDEBUG << "Recognizer: already visited:" << next; +#endif + } + } + RecognizerData* recoData=static_cast(analysis.getData("RecognizerData")); + std::set& nextVertices = recoData->getNextVertices(); + if (recoData != 0 && !nextVertices.empty() ) + { +#ifdef DEBUG_LP + LDEBUG << "Recognizer: adding next vertices to the 'to visit' list"; +#endif + std::set< LinguisticGraphVertex >::const_iterator nvit, nvit_end; + nvit = nextVertices.begin(); + nvit_end = nextVertices.end(); + for (; nvit != nvit_end; nvit++) + { +#ifdef DEBUG_LP + LDEBUG << " - " << *nvit; +#endif + toVisit.push_front(*nvit); + } + nextVertices.clear(); + } +#ifdef DEBUG_LP + LDEBUG << "Recognizer: 'to visit' list size is now: " << toVisit.size(); +#endif + } + return numberOfRecognized; +} + + +//********************************************************************** +// test the recognizer on a vertex : test +//********************************************************************** +uint64_t Recognizer:: +testOnVertex(const LinguisticAnalysisStructure::AnalysisGraph& graph, + LinguisticGraphVertex& current, + const LinguisticGraphVertex& begin, + const LinguisticGraphVertex& end, + AnalysisContent& analysis, + std::vector& result, + bool stopAtFirstSuccess, + bool onlyOneSuccessPerType, + bool 
applySameRuleWhileSuccess) const +{ + //AULOGINIT; + Token* token = get(vertex_token, *(graph.getGraph()), current); + MorphoSyntacticData* data = get(vertex_data, *(graph.getGraph()), current); + + if (token==0) { + AULOGINIT; + LERROR << "no token for vertex " << current; + return 0; + } + + if (data==0) { + AULOGINIT; + LERROR << "no data for vertex " << current; + return 0; + } + + vector matchingRules; + set forbiddenTypes; + uint64_t nbSuccess=0; + + findNextSetOfRules(graph, current, analysis, token, data, matchingRules); + + if (! matchingRules.empty()) { + std::vector::const_iterator + ruleSet=matchingRules.begin(), + ruleSet_end=matchingRules.end(); + for (; ruleSet!=ruleSet_end; ruleSet++) { + uint64_t nbSuccessForTheseRules= + testSetOfRules(*((*ruleSet)->transitionUnit()), + (*ruleSet)->setOfRules(), + graph, current, begin, end,analysis, + result, &forbiddenTypes, + stopAtFirstSuccess, + onlyOneSuccessPerType, + applySameRuleWhileSuccess); + if (nbSuccessForTheseRules>0) { + nbSuccess+=nbSuccessForTheseRules; + // skip recognized part (if the end of the recognized part is after + // current token) + RecognizerMatch& lastSuccess=result.back(); + Token* t=get(vertex_token,*(graph.getGraph()),current); + uint64_t currentTokenEnd=t->position()+t->length(); + RecognizerData* recoData = static_cast(analysis.getData("RecognizerData")); + if (stopAtFirstSuccess||(recoData != 0 && !recoData->getNextVertices().empty())) { + if (lastSuccess.positionEnd() >= currentTokenEnd) { + current=lastSuccess.getEnd(); + } + break; + } + } + } + for(std::vector::iterator it=matchingRules.begin(), + it_end=matchingRules.end(); it!=it_end; it++) { + if (*it!=0) { + delete (*it); + } + } + } + forbiddenTypes.clear(); + + // LDEBUG << "testOnVertex nb successes: " << nbSuccess; + return nbSuccess; +} + +//********************************************************************** +//resolve the problem of overlapping entities in the list of entities : +// when two entities are 
overlaping, only one is kept +//********************************************************************** +uint64_t Recognizer:: +resolveOverlappingEntities(std::vector& listEntities, + const OverlapResolutionStrategy& strategy) const +{ + typedef std::vector::iterator vectorRecognizerMatchIterator; + + uint64_t numberOfOverlappingEntities(0); + + if (listEntities.empty()) { + return numberOfOverlappingEntities; + } + + switch (strategy) { + case IGNORE_FIRST: { + vectorRecognizerMatchIterator currentEntity(listEntities.begin()); + vectorRecognizerMatchIterator nextEntity(currentEntity); + nextEntity++; + while (nextEntity != listEntities.end()) { + if (currentEntity->isOverlapping(*nextEntity)) { + numberOfOverlappingEntities++; + currentEntity=listEntities.erase(currentEntity); + nextEntity=currentEntity; + nextEntity++; + } + else { + currentEntity++; + nextEntity++; + } + } + break; + } + case IGNORE_SECOND: { + vectorRecognizerMatchIterator currentEntity(listEntities.begin()); + vectorRecognizerMatchIterator previousEntity(currentEntity); + currentEntity++; + while (currentEntity != listEntities.end()) { + if (currentEntity->isOverlapping(*previousEntity)) { + numberOfOverlappingEntities++; + currentEntity=listEntities.erase(currentEntity); + } + else { + previousEntity++; + currentEntity++; + } + } + break; + } + case IGNORE_SMALLEST: { + vectorRecognizerMatchIterator currentEntity(listEntities.begin()); + vectorRecognizerMatchIterator previousEntity(currentEntity); + currentEntity++; + while (currentEntity != listEntities.end()) { + if (currentEntity->isOverlapping(*previousEntity)) { + numberOfOverlappingEntities++; + if (currentEntity->numberOfElements() + < previousEntity->numberOfElements()) { // keep previous entity + currentEntity=listEntities.erase(currentEntity); + } + else { // keep current entity + previousEntity=listEntities.erase(previousEntity); + currentEntity=previousEntity; + currentEntity++; + } + } + else { + previousEntity++; + 
currentEntity++; + } + } + break; + } + default: + break; + } + + return numberOfOverlappingEntities; +} + +//********************************************************************** +// find the set of rules in the recognizer that accept +// a particular token as trigger +//********************************************************************** +void Recognizer:: +findNextSetOfRules(const LinguisticAnalysisStructure::AnalysisGraph& graph, + LinguisticGraphVertex& vertex, + AnalysisContent& analysis, + const LinguisticAnalysisStructure::Token* token, + const LinguisticAnalysisStructure::MorphoSyntacticData* data, + std::vector& matchingSetOfRules) const +{ + matchingSetOfRules.clear(); + + // find matching rules + std::vector matchingRules; + m_searchStructure.findMatchingTransitions(graph,vertex,analysis,token,data,matchingRules); + + // matching rules are gathered by common trigger (transition unit) + // we have to re-sort the rules by their weight at a global level, independently of the trigger + // create a vector of TriggerRule where each contains only one rule, then sort it + for (std::vector::const_iterator it=matchingRules.begin(),it_end=matchingRules.end();it!=it_end;it++) { + for (SetOfRules::const_iterator r=(*it)->setOfRules().begin(),r_end=(*it)->setOfRules().end(); r!=r_end;r++) { + matchingSetOfRules.push_back(new TriggerRule((*it)->transitionUnit(),SetOfRules(1,*r))); + } + } + sort(matchingSetOfRules.begin(),matchingSetOfRules.end(),CompareTriggerRule()); + + // then, gather rules with the same trigger that are consecutive in this new list + // (may save some constraint checking on trigger) + if (! 
matchingSetOfRules.empty()) { + std::vector::iterator it=matchingSetOfRules.begin(); + TransitionUnit* currentTrigger=(*it)->transitionUnit(); + std::vector::iterator next=it; + next++; + while (next!=matchingSetOfRules.end()) { + if ((*next)->transitionUnit() == currentTrigger) { + (*it)->second.push_back((*next)->setOfRules().front()); + delete *next; + next=matchingSetOfRules.erase(next); + } + else { + it++; + currentTrigger=(*it)->transitionUnit(); + next++; + } + } + } +} + +void Recognizer::initializeSearchStructure() { + const Common::PropertyCode::PropertyAccessor* macro=&(static_cast(Common::MediaticData::MediaticData::single().mediaData(m_language)).getPropertyCodeManager().getPropertyAccessor("MACRO")); + const Common::PropertyCode::PropertyAccessor* micro=&(static_cast(Common::MediaticData::MediaticData::single().mediaData(m_language)).getPropertyCodeManager().getPropertyAccessor("MICRO")); + m_searchStructure.init(m_rules,macro,micro); +} + +void Recognizer::clearSearchStructure() { + m_searchStructure.clear(); +} + +//********************************************************************** +// adding a rule +//********************************************************************** +uint64_t Recognizer::addRuleInStorage(Rule* rule) +{ + // add the rule in the storage + m_ruleStorage.push_back(rule); + // return the index of the rule in the storage + return (m_ruleStorage.size() - 1); +} + +uint64_t Recognizer::addRule(TransitionUnit* trigger, Rule* rule) +{ + uint64_t indexRule=addRuleInStorage(rule); + + // find if the trigger already exists in the set of triggers + for (uint64_t i(0); iclone(), + SetOfRules(1,rule))); + + return indexRule; +} + +void Recognizer::addRule(TransitionUnit* trigger, + const uint64_t index) +{ + // find if the trigger already exists in the set of triggers + for (uint64_t i(0); iclone(), + SetOfRules(1,m_ruleStorage[index]))); +} + +//********************************************************************** +// input/output in a 
binary format +//********************************************************************** +// void Recognizer::readFromTextFile(std::string filename) { +// RecognizerCompiler::buildRecognizer(*this,filename); +// } + +// simple linear search (called only with write function -> not optimized) +uint64_t Recognizer::findRuleIndex(Rule* r) const +{ + for (uint64_t i(0); i" << m_rules[i].first->printValue() << "" + << "" << i << "" << endl; + } +} + +//*************************************************************************** +// output +//*************************************************************************** +ostream& operator << (ostream& os, const Recognizer& r) +{ + for (uint64_t i(0); i(name,value); } + void addVertexAsEmbededEntity(const LinguisticGraphVertex& vertex) + { + m_embededEntities.insert(vertex); + } + bool hasVertexAsEmbededEntity(const LinguisticGraphVertex& vertex) const + { + return (m_embededEntities.find(vertex) != m_embededEntities.end()); + } void clearEntityFeatures(); Automaton::EntityFeatures& getEntityFeatures() { return m_entityFeatures; } @@ -155,6 +163,8 @@ class LIMA_AUTOMATON_EXPORT RecognizerData : public AnalysisData // EntityFeatures : for functions to add features Automaton::EntityFeatures m_entityFeatures; + // embededEntities : set of embeded entities + std::set< LinguisticGraphVertex > m_embededEntities; }; } // end namespace diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/recognizerMatch.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/recognizerMatch.cpp index d3f227f8e..b064aa7c5 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/recognizerMatch.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/recognizerMatch.cpp @@ -211,6 +211,13 @@ LimaString RecognizerMatch::getNormalizedString(const FsaStringsPool& sp) const v != m_graph->lastVertex()) { if ((*i).isKept()) { Token* t = get(vertex_token,*(m_graph->getGraph()),v); 
+#ifdef ANTINNO_SPECIFIC +#ifdef DEBUG_LP + LOGINIT("LP::Automaton"); + LDEBUG << "RecognizerMatch::getNormalizedString(...) token.form(): " << t->form(); + LDEBUG << "RecognizerMatch::getNormalizedString(...) token.stringForm(): " << t->stringForm(); +#endif +#endif if (t->status().isAlphaHyphen()) { firstHyphenPassed = true; @@ -222,6 +229,12 @@ LimaString RecognizerMatch::getNormalizedString(const FsaStringsPool& sp) const } else { // take first norm +#ifdef ANTINNO_SPECIFIC +#ifdef DEBUG_LP + LOGINIT("LP::Automaton"); + LDEBUG << "RecognizerMatch::getNormalizedString(...) data->front().normalizedForm: " << data->front().normalizedForm; +#endif +#endif str += sp[data->front().normalizedForm]; } currentPosition=t->position()+t->length(); @@ -291,6 +304,9 @@ isOverlapping(const RecognizerMatch& otherMatch) const { //********************************************************************** void RecognizerMatch::addBackVertex(const LinguisticGraphVertex& v, bool isKept, const LimaString& ruleElementId ) { + AULOGINIT; + LDEBUG << "RecognizerMatch:addBackVertex(v:" << v << ", isKept:" << isKept << ", ruleElmtId:" << ruleElementId << ")"; + push_back(MatchElement(v,isKept, ruleElementId)); } @@ -303,6 +319,8 @@ void RecognizerMatch::popBackVertex() { void RecognizerMatch::addFrontVertex(const LinguisticGraphVertex& v, bool isKept, const LimaString& ruleElementId) { + AULOGINIT; + LDEBUG << "RecognizerMatch:addFrontVertex(v:" << v << ", isKept:" << isKept << ", ruleElmtId:" << ruleElementId << ")"; insert(begin(),MatchElement(v,isKept,ruleElementId)); } @@ -314,10 +332,16 @@ void RecognizerMatch::popFrontVertex() { } void RecognizerMatch::addBack(const RecognizerMatch& l) { + if( l.getHead() != 0 ){ + setHead(l.getHead()); + } insert(end(),l.begin(),l.end()); } void RecognizerMatch::addFront(const RecognizerMatch& l) { + if( l.getHead() != 0 ){ + setHead(l.getHead()); + } insert(begin(),l.begin(),l.end()); } diff --git 
a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/rule.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/rule.cpp index 9ebb67b53..d887ad8c7 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/rule.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/rule.cpp @@ -302,7 +302,7 @@ bool Rule::executeActions(const LinguisticAnalysisStructure::AnalysisGraph& grap LDEBUG << "Rule::executeActions: check vertex " << matchElmt->m_elem.first << " with " << matchElmt->getRuleElemtId(); #endif - if( matchElmt->getRuleElemtId() == ruelElemtId ) { + if( (matchElmt->getRuleElemtId()).startsWith(ruelElemtId) ) { #ifdef DEBUG_LP LDEBUG << "Rule::executeActions: found " << matchElmt->m_elem.first; #endif diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/transitionSearchStructure.h b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/transitionSearchStructure.h index b400f3f54..b6b72141f 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/transitionSearchStructure.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/transitionSearchStructure.h @@ -37,6 +37,9 @@ #include "transitionUnit.h" #include "automatonCommon.h" #include +#ifdef ANTINNO_SPECIFIC +#include "searchGraph.h" +#endif namespace Lima { namespace LinguisticProcessing { @@ -67,6 +70,15 @@ class TransitionSearchStructure const LinguisticAnalysisStructure::Token* token, const LinguisticAnalysisStructure::MorphoSyntacticData* data, std::vector& matchingSetOfRules) const; + uint64_t + findMatchingTransitions2(const LinguisticAnalysisStructure::AnalysisGraph& graph, + const LinguisticGraphVertex& vertex, + const LinguisticGraphVertex& limit, + SearchGraph* searchGraph, + AnalysisContent& analysis, + const LinguisticAnalysisStructure::Token* token, + const LinguisticAnalysisStructure::MorphoSyntacticData* data, + std::vector,const TargetType*> >& 
matchingSetOfRules) const; // for debug only void printStructure(std::ostream& os) const; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/transitionUnit.h b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/transitionUnit.h index f52023641..7c11cc1ce 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/transitionUnit.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/Automaton/transitionUnit.h @@ -56,7 +56,9 @@ typedef enum { T_AND, T_SET, T_DEACCENTUATED, - T_ENTITY + T_ENTITY, + T_ENTITY_GROUP, + T_GAZETEER } TypeTransition; // useful for the read/write functions class LIMA_AUTOMATON_EXPORT TransitionUnit diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/CoreLinguisticProcessingClient.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/CoreLinguisticProcessingClient.cpp index c344bc0a9..7a729cd9c 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/CoreLinguisticProcessingClient.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/CoreLinguisticProcessingClient.cpp @@ -33,6 +33,7 @@ #include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" #include "common/Data/strwstrtools.h" #include "common/time/timeUtilsController.h" +#include "common/tools/FileUtils.h" #include "linguisticProcessing/LinguisticProcessingCommon.h" #include "linguisticProcessing/client/LinguisticProcessingClientFactory.h" #include "common/MediaProcessors/MediaProcessors.h" @@ -42,6 +43,7 @@ #include "linguisticProcessing/core/LinguisticResources/LinguisticResources.h" #include +#include uint64_t t1; @@ -58,15 +60,13 @@ namespace Lima namespace LinguisticProcessing { -CoreLinguisticProcessingClientFactory* CoreLinguisticProcessingClientFactory::s_instance=new CoreLinguisticProcessingClientFactory(); +std::unique_ptr CoreLinguisticProcessingClientFactory::s_instance=std::unique_ptr(new CoreLinguisticProcessingClientFactory()); 
CoreLinguisticProcessingClient::CoreLinguisticProcessingClient() {} CoreLinguisticProcessingClient::~CoreLinguisticProcessingClient() { - delete LinguisticResources::pchangeable(); - delete MediaProcessors::pchangeable(); } void CoreLinguisticProcessingClient::analyze( @@ -74,12 +74,19 @@ void CoreLinguisticProcessingClient::analyze( const std::map& metaData, const std::string& pipelineId, const std::map& handlers, +#ifdef ANTINNO_SPECIFIC + const std::set& inactiveUnits, StopAnalyze const& stopAnalyze) const +#else const std::set& inactiveUnits) const - +#endif { LimaString limatexte=Common::Misc::utf8stdstring2limastring(texte); - +#ifdef ANTINNO_SPECIFIC + analyze(limatexte,metaData,pipelineId,handlers,inactiveUnits, stopAnalyze); +#else analyze(limatexte,metaData,pipelineId,handlers,inactiveUnits); +#endif + } void CoreLinguisticProcessingClient::analyze( @@ -87,13 +94,21 @@ void CoreLinguisticProcessingClient::analyze( const std::map& metaData, const std::string& pipelineId, const std::map& handlers, +#ifdef ANTINNO_SPECIFIC + const std::set& inactiveUnits, StopAnalyze const& stopAnalyze) const +#else const std::set& inactiveUnits) const +#endif { Lima::TimeUtilsController timer("CoreLinguisticProcessingClient::analyze"); CORECLIENTLOGINIT; // create analysis content +#ifdef ANTINNO_SPECIFIC + AnalysisContent analysis(stopAnalyze); +#else AnalysisContent analysis; +#endif LinguisticMetaData* metadataholder=new LinguisticMetaData(); // will be destroyed in AnalysisContent destructor analysis.setData("LinguisticMetaData",metadataholder); @@ -258,7 +273,6 @@ void CoreLinguisticProcessingClientFactory::configure( } } - string configPath=Common::MediaticData::MediaticData::single().getConfigPath(); for (deque::const_iterator langItr=langToload.begin(); langItr!=langToload.end(); langItr++) @@ -268,17 +282,30 @@ void CoreLinguisticProcessingClientFactory::configure( string file; try { - file=configPath + "/" + configuration.getModuleGroupParamValue( + QStringList 
configPaths = QString::fromUtf8(Common::MediaticData::MediaticData::single().getConfigPath().c_str()).split(LIMA_PATH_SEPARATOR); + Q_FOREACH(QString confPath, configPaths) + { + QString mediaProcessingDefinitionFile = QString::fromUtf8(configuration.getModuleGroupParamValue( "lima-coreclient", "mediaProcessingDefinitionFiles", - *langItr); + *langItr).c_str()); + if (QFileInfo(confPath + "/" + mediaProcessingDefinitionFile).exists()) + { + file= (confPath + "/" + mediaProcessingDefinitionFile).toUtf8().constData(); + break; + } + } } catch (NoSuchParam& ) { LERROR << "no language definition file for language " << *langItr; throw InvalidConfiguration("no language definition file for language "); } - + if (file.empty()) + { + LERROR << "no language definition file for language " << *langItr; + throw InvalidConfiguration("no language definition file for language "); + } XMLConfigurationFileParser langParser(file); //initialize SpecificEntities @@ -331,9 +358,9 @@ void CoreLinguisticProcessingClientFactory::configure( } } -AbstractLinguisticProcessingClient* CoreLinguisticProcessingClientFactory::createClient() const +std::shared_ptr< AbstractProcessingClient > CoreLinguisticProcessingClientFactory::createClient() const { - return new CoreLinguisticProcessingClient(); + return std::shared_ptr< AbstractProcessingClient >(new CoreLinguisticProcessingClient()); } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/CoreLinguisticProcessingClient.h b/lima_linguisticprocessing/src/linguisticProcessing/core/CoreLinguisticProcessingClient.h index 3e1ee072c..9770305ad 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/CoreLinguisticProcessingClient.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/CoreLinguisticProcessingClient.h @@ -49,15 +49,23 @@ class LIMA_CORELINGUISTICPROCESSINGCLIENT_EXPORT CoreLinguisticProcessingClient const std::map& metaData, const std::string& pipeline, const std::map& handlers, - const std::set& 
inactiveUnits = std::set()) const - ; +#ifdef ANTINNO_SPECIFIC + const std::set& inactiveUnits = std::set(), Lima::StopAnalyze const& stopAnalyze = Lima::defaultStopAnalyze) const +#else + const std::set& inactiveUnits = std::set()) const +#endif +; void analyze(const std::string& texte, const std::map& metaData, const std::string& pipeline, const std::map& handlers, +#ifdef ANTINNO_SPECIFIC + const std::set& inactiveUnits = std::set(), Lima::StopAnalyze const& stopAnalyze = Lima::defaultStopAnalyze) const +#else const std::set& inactiveUnits = std::set()) const - ; +#endif +; }; class CoreLinguisticProcessingClientFactory : public AbstractLinguisticProcessingClientFactory @@ -70,13 +78,13 @@ class CoreLinguisticProcessingClientFactory : public AbstractLinguisticProcessin std::deque langs, std::deque pipelines); - AbstractLinguisticProcessingClient* createClient() const; + std::shared_ptr< AbstractProcessingClient > createClient() const; virtual ~CoreLinguisticProcessingClientFactory(); private: CoreLinguisticProcessingClientFactory(); - static CoreLinguisticProcessingClientFactory* s_instance; + static std::unique_ptr s_instance; }; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/CorefSolving/corefSolver.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/CorefSolving/corefSolver.cpp index 077ef398c..6ff6e112e 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/CorefSolving/corefSolver.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/CorefSolving/corefSolver.cpp @@ -292,7 +292,7 @@ LimaStatusCode CorefSolver::process( * function */ if (annotationData->dumpFunction("Coreferent") == 0) { - annotationData->dumpFunction("Coreferent", new DumpCoreferent()); + annotationData->dumpFunction("Coreferent", new DumpCoreferent(annotationData)); } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/CorefSolving/coreferentAnnotation.cpp 
b/lima_linguisticprocessing/src/linguisticProcessing/core/CorefSolving/coreferentAnnotation.cpp index 7ca3bb1e4..35b720d4b 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/CorefSolving/coreferentAnnotation.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/CorefSolving/coreferentAnnotation.cpp @@ -55,7 +55,7 @@ int DumpCoreferent::dump(std::ostream& os, Common::AnnotationGraphs::GenericAnno PROCESSORSLOGINIT; try { - ga.value().dump(os); + ga.value().dump(os, m_ad); return SUCCESS_ID; } catch (const boost::bad_any_cast& ) @@ -1111,8 +1111,8 @@ bool CoreferentAnnotation::aba5( // COREFSOLVERLOGINIT; // LDEBUG << "aba5"; LinguisticGraph* graph = anagraph->getGraph(); - LinguisticCode L_NC = (*tagLocalDef.find("NomCommunMacroCategory")).second; - LinguisticCode L_NP = (*tagLocalDef.find("NomPropreMacroCategory")).second; + LinguisticCode NC = (*tagLocalDef.find("NomCommunMacroCategory")).second; + LinguisticCode NP = (*tagLocalDef.find("NomPropreMacroCategory")).second; bool res = false; DependencyGraphVertex* qv = new DependencyGraphVertex(); if (ca.isDeterminer(qv,sd, relLocalDef, language, anagraph, ac)) @@ -1120,7 +1120,7 @@ bool CoreferentAnnotation::aba5( MorphoSyntacticData* data = get(vertex_data,*graph,sd->tokenVertexForDepVertex(*qv)); if (data ==0 || data->empty()) { return false; }; // if *qv is a noun - if (data->firstValue(*macroAccessor) == L_NC || data->firstValue(*macroAccessor) == L_NP) + if (data->firstValue(*macroAccessor) == NC || data->firstValue(*macroAccessor) == NP) { // if Q is in the argument domain of N, CoreferentAnnotation caQ(0,*qv); @@ -1307,6 +1307,49 @@ AnnotationGraphVertex CoreferentAnnotation::writeAnnotation( return AnnotationGraphVertex(); //unused; } +DumpCoreferent::DumpCoreferent(const Lima::Common::AnnotationGraphs::AnnotationData* ad) : + Common::AnnotationGraphs::AnnotationData::Dumper(), + m_ad(ad) +{ +} + +void CoreferentAnnotation::dump(std::ostream& os, const 
Common::AnnotationGraphs::AnnotationData* ad) const +{ + os << "#" << m_id << ";" << m_categ<< ";" /*<< "V:" << m_morphVertex */; + CoreferentAnnotation antecedent; + bool hasAntecedent = false; + std::set< AnnotationGraphVertex > matches = ad->matches("PosGraph",m_morphVertex,"annot"); + if (matches.empty()) + { + COREFSOLVERLOGINIT; + LERROR << "CoreferentAnnotation::dump No annotation graph vertex matches PoS graph vertex " << m_morphVertex << ". This should not happen."; + return ; + } + AnnotationGraphVertex av = *matches.begin(); + AnnotationGraphOutEdgeIt it, it_end; + boost::tie(it, it_end) = boost::out_edges(av, ad->getGraph()); + if (it != it_end) + { + for (; it != it_end; it++) + { + GenericAnnotation ga = ad->annotation(boost::target(*it, ad->getGraph()), utf8stdstring2limastring("Coreferent")); + try + { + antecedent = ga.value(); + hasAntecedent = true; + break; + } + catch (const boost::bad_any_cast& ) + { + continue; + } + } + } + if (hasAntecedent) + { + os << "#" << antecedent.id(); + } +} void CoreferentAnnotation::outputXml(std::ostream& xmlStream,const LinguisticGraph& g, const AnnotationData* ad) const { diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/CorefSolving/coreferentAnnotation.h b/lima_linguisticprocessing/src/linguisticProcessing/core/CorefSolving/coreferentAnnotation.h index 238970472..7134fdabc 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/CorefSolving/coreferentAnnotation.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/CorefSolving/coreferentAnnotation.h @@ -113,7 +113,7 @@ typedef std::map< CoreferentAnnotation*,std::map > inline void morphVertex(LinguisticGraphVertex v); inline void av(AnnotationGraphVertex av); inline void newerRef(CoreferentAnnotation* newerRef); - inline void dump(std::ostream& os); + void dump( std::ostream& os, const Lima::Common::AnnotationGraphs::AnnotationData* ad ) const; inline bool hasNewerRef(/*std::deque* npCandidates*/); /** general test 
functions */ @@ -514,10 +514,6 @@ inline void CoreferentAnnotation::newerRef(CoreferentAnnotation* newerRef) { m_newerRef = newerRef; } -inline void CoreferentAnnotation::dump(std::ostream& os) -{ - os << "#" << m_id << ";" << m_categ<< ";" << /*"V:" << m_morphVertex <<*/ "\n"; -} inline bool CoreferentAnnotation::hasNewerRef() { return (newerRef()!=this); @@ -537,8 +533,13 @@ return (newerRef()!=this); */ class DumpCoreferent : public Common::AnnotationGraphs::AnnotationData::Dumper { - public: - virtual int dump(std::ostream& os, Common::AnnotationGraphs::GenericAnnotation& ga) const; +public: + DumpCoreferent(const Lima::Common::AnnotationGraphs::AnnotationData* ad); + virtual int dump(std::ostream& os, Common::AnnotationGraphs::GenericAnnotation& ga) const; + +private: + const Lima::Common::AnnotationGraphs::AnnotationData* m_ad; + }; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/Dictionary/DictionaryCode.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/Dictionary/DictionaryCode.cpp index 2431495f6..4c569590f 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/Dictionary/DictionaryCode.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/Dictionary/DictionaryCode.cpp @@ -30,6 +30,7 @@ #include "common/linguisticData/languageData.h" #include "common/XMLConfigurationFiles/xmlConfigurationFileParser.h" #include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" +#include "common/misc/FileUtils.h" #include "common/misc/strwstrtools.h" // #include "linguisticProcessing/core/Tokenizer/Exceptions.h" @@ -96,20 +97,22 @@ void DictionaryCode::init( #endif m_language=manager->getInitializationParameters().language; std::string resourcesPath=Common::LinguisticData::LinguisticData::single().getResourcesPath(); - std::string codesListFileName; - std::string codeFileName; - try - { - codesListFileName=resourcesPath+"/"+unitConfiguration.getParamsValueAtKey("codeListFile"); - } - catch (NoSuchParam& 
) - { - LERROR << "no param 'codeListFile' in DictionaryCode group for language " << (int) m_language; - throw InvalidConfiguration(); - } +// QString codesListFileName; +// try +// { +// codesListFileName = Common::Misc::findFileInPaths(resourcesPath.c_str(), unitConfiguration.getParamsValueAtKey("codeListFile").c_str()); +// } +// catch (NoSuchParam& ) +// { +// LERROR << "no param 'codeListFile' in DictionaryCode group for language " << (int) m_language; +// throw InvalidConfiguration(); +// } +// loadCodesMaps(codesListFileName); + + QString codeFileName; try { - codeFileName=resourcesPath+"/"+unitConfiguration.getParamsValueAtKey("codeFile"); + codeFileName = Common::Misc::findFileInPaths(resourcesPath.c_str(), unitConfiguration.getParamsValueAtKey("codeFile").c_str()); } catch (NoSuchParam& ) { @@ -117,8 +120,7 @@ void DictionaryCode::init( throw InvalidConfiguration(); } -// loadCodesMaps(codesListFileName); - parse(codeFileName); + parse(codeFileName.toUtf8().constData()); } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplate.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplate.cpp index 9289b5656..51f21430b 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplate.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplate.cpp @@ -1,21 +1,3 @@ -/* - Copyright 2002-2013 CEA LIST - - This file is part of LIMA. - - LIMA is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - LIMA is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. 
- - You should have received a copy of the GNU Affero General Public License - along with LIMA. If not, see -*/ /************************************************************************ * * @file EventTemplate.cpp @@ -38,6 +20,13 @@ m_mainEvent(false) { } +EventTemplate::EventTemplate(const std::string type): +m_template(), +m_weight(0.0), +m_type(type), +m_mainEvent(false) +{ +} EventTemplate::~EventTemplate() { } @@ -61,7 +50,7 @@ const EventTemplateElement& EventTemplate::getElement(const std::string& role) c it=m_template.find(role); if (it==m_template.end()) { LOGINIT("LP::EventAnalysis"); - LERROR << "No element '" << role << "' in EventTemplate"; + LERROR << "No element '" << role << "' in EventTemplate" << LENDL; return emptyElement; } return (*it).second; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplate.h b/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplate.h index 070315a14..070f6c11e 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplate.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplate.h @@ -1,21 +1,3 @@ -/* - Copyright 2002-2013 CEA LIST - - This file is part of LIMA. - - LIMA is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - LIMA is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with LIMA. 
If not, see -*/ /************************************************************************ * * @file EventTemplate.h @@ -46,11 +28,13 @@ typedef std::string EventRole; class LIMA_EVENTANALISYS_EXPORT EventTemplate { public: - EventTemplate(); + EventTemplate(); + EventTemplate(const std::string); ~EventTemplate(); void addElement(const std::string& role, const EventTemplateElement& elt); void setWeight(double w) { m_weight=w; } + void setType(const std::string type) { m_type=type; } void setMain(bool isMainEvent) { m_mainEvent=isMainEvent; } void clear(); @@ -58,12 +42,14 @@ class LIMA_EVENTANALISYS_EXPORT EventTemplate const std::map& getTemplateElements() const { return m_template; } std::map& getTemplateElements() { return m_template; } double getWeight() const { return m_weight; } + const std::string getType() const { return m_type; } bool isMainEvent() const { return m_mainEvent; } private: std::map m_template; double m_weight; bool m_mainEvent; + std::string m_type; }; } // end namespace diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateData.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateData.cpp index b5a5ccfc7..06961be37 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateData.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateData.cpp @@ -67,6 +67,14 @@ void EventTemplateData::clearCurrentTemplate() back().clear(); } +void EventTemplateData::setTypeInCurrentTemplate(const std::string& type) +{ + LOGINIT("LP::EventAnalysis"); + LDEBUG << "set Current Template Type " << type << LENDL; + back().setType(type); + LDEBUG << "bak.getType " << back().getType() << LENDL; +} + //------------------------------------------------------------------------------- // conversion to Events (for compatibility with EventExtraction web service) Events* EventTemplateData:: diff --git 
a/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateData.h b/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateData.h index d9a29adad..7469eb71a 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateData.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateData.h @@ -51,6 +51,7 @@ class LIMA_EVENTANALISYS_EXPORT EventTemplateData : public AnalysisData, public void addTemplate(); void addElementInCurrentTemplate(const std::string& role, const EventTemplateElement& elt); void clearCurrentTemplate(); + void setTypeInCurrentTemplate(const std::string&); Events* convertToEvents(const Common::AnnotationGraphs::AnnotationData* annotationData) const; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateDataXmlLogger.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateDataXmlLogger.cpp index baeb1937c..1cc6cfeeb 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateDataXmlLogger.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateDataXmlLogger.cpp @@ -135,7 +135,9 @@ void EventTemplateDataXmlLogger::outputEventData(std::ostream& out, i++; out << " " << endl; + << " main=\"" << (*it).isMainEvent() << "\"" + << " type=\"" << (*it).getType() << "\">" + << endl; int j=0; out << " " << endl; for(map::const_iterator it1= templateElements.begin(); it1!= templateElements.end();it1++) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateDefinitionResource.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateDefinitionResource.cpp index a50d9c1ae..c1f059ae8 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateDefinitionResource.cpp +++ 
b/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateDefinitionResource.cpp @@ -1,21 +1,3 @@ -/* - Copyright 2002-2013 CEA LIST - - This file is part of LIMA. - - LIMA is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - LIMA is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with LIMA. If not, see -*/ /************************************************************************ * * @file EventTemplateDefinitionResource.cpp @@ -44,12 +26,49 @@ EventTemplateDefinitionResourceFactory(EVENTTEMPLATEDEFINITIONRESOURCE_CLASSID); //---------------------------------------------------------------------- EventTemplateDefinitionResource::EventTemplateDefinitionResource(): -m_language(0) +m_language(0), +m_templates(), +m_elementMapping() { } EventTemplateDefinitionResource::~EventTemplateDefinitionResource() { } +const std::string& EventTemplateDefinitionResource::getMention (const std::string name) const +{ +#ifdef ANTINNO_SPECIFIC + // pour éviter erreur c4172 + static std::string const mention=""; +#else + std::string mention=""; +#endif + LOGINIT("LP::EventAnalysis"); + LDEBUG << "getMention m_templates.size() " << m_templates.size(); + for(std::vector::const_iterator it=m_templates.begin();it!=m_templates.end();it++) + { + LDEBUG << "Cuurent Mention " << it->getMention()<< LENDL; + if (name.compare(it->getName())==0) return it->getMention(); + } + return mention; +} + +const std::map& EventTemplateDefinitionResource::getStructure (const std::string name) const +{ +#ifdef ANTINNO_SPECIFIC + // 
pour éviter erreur c4172 + static std::map const structure; +#else + std::map structure; +#endif + LOGINIT("LP::EventAnalysis"); + LDEBUG << "getMention m_templates.size() " << m_templates.size(); + for(std::vector::const_iterator it=m_templates.begin();it!=m_templates.end();it++) + { + //LDEBUG << "Cuurent Mention " << it->getMention()<< LENDL; + if (name.compare(it->getName())==0) return it->getStructure(); + } + return structure; +} //---------------------------------------------------------------------- void EventTemplateDefinitionResource:: @@ -60,25 +79,38 @@ init(GroupConfigurationStructure& unitConfiguration, LOGINIT("LP::EventAnalysis"); m_language=manager->getInitializationParameters().language; - string resourcesPath=Common::MediaticData::MediaticData::single().getResourcesPath(); - EventTemplateStructure structure; // get name try { string name = unitConfiguration.getParamsValueAtKey("templateName"); structure.setName(name); + LDEBUG << "Template name = "<< name; + } catch (NoSuchParam& ) { LERROR << "No param 'templateName' in EventTemplateDefinitionResource for language " << (int)m_language; throw InvalidConfiguration(); } + try{ + + string nameMention = unitConfiguration.getParamsValueAtKey("templateMention"); + LDEBUG << "Template mention = "<< nameMention; + structure.setMention(nameMention); + } + + catch (NoSuchParam& ) { + LERROR << "No param 'templateMention' in EventTemplateDefinitionResource for language " << (int)m_language; + //throw InvalidConfiguration(); + } // get template elements: role and entity types try { map elts = unitConfiguration.getMapAtKey("templateElements"); + LDEBUG << "templateElements .size " << elts.size(); for(map::const_iterator it=elts.begin(),it_end=elts.end();it!=it_end;it++) { + LDEBUG << "templateElement =" << (*it).first; structure.addTemplateElement((*it).first,(*it).second); } } @@ -88,9 +120,11 @@ init(GroupConfigurationStructure& unitConfiguration, } // get element mapping, for template merging + LDEBUG << 
"get elementMapping "; try { map mapping = unitConfiguration.getMapAtKey("elementMapping"); + LDEBUG << "after Getting map "; for(map::const_iterator it=mapping.begin(),it_end=mapping.end();it!=it_end;it++) { const std::string& elements=(*it).second; // comma-separated list of elements @@ -102,10 +136,10 @@ init(GroupConfigurationStructure& unitConfiguration, } } } - catch (NoSuchParam& ) { + catch (NoSuchMap& ) { LDEBUG << "No param 'elementMapping' in EventTemplateDefinition for language " << (int)m_language; } - + LDEBUG << "Adding Structure "; m_templates.push_back(structure); } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateDefinitionResource.h b/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateDefinitionResource.h index 7e1dd2234..72f9f5d1e 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateDefinitionResource.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateDefinitionResource.h @@ -1,21 +1,3 @@ -/* - Copyright 2002-2013 CEA LIST - - This file is part of LIMA. - - LIMA is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - LIMA is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with LIMA. 
If not, see -*/ /************************************************************************ * * @file EventTemplateDefinitionResource.h @@ -54,6 +36,8 @@ class LIMA_EVENTANALISYS_EXPORT EventTemplateDefinitionResource : public Abstrac // mapping is oriented, return 1 if mapping elt1 -> elt2, -1 if mapping elt2 -> elt1, 0 otherwise int existsMapping(const std::string& eltName1, const std::string& eltName2) const; + const std::string& getMention(const std::string) const; + const std::map& getStructure(const std::string) const; private: MediaId m_language; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateFillingActions.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateFillingActions.cpp index f67ce99c8..f1d816e95 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateFillingActions.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateFillingActions.cpp @@ -135,8 +135,13 @@ bool AddTemplateElement::operator()(const LinguisticAnalysisStructure::AnalysisG //---------------------------------------------------------------------- CreateEventTemplate::CreateEventTemplate(MediaId language, const LimaString& complement): -Automaton::ConstraintFunction(language,complement) +Automaton::ConstraintFunction(language,complement), +m_eventType() { + LOGINIT("LP::EventAnalysis"); + LDEBUG << "Complement " << complement << LENDL; + m_eventType=Common::Misc::limastring2utf8stdstring(complement); + LDEBUG << "m_event_type " << m_eventType << LENDL; } bool CreateEventTemplate::operator()(AnalysisContent& analysis) const @@ -151,6 +156,9 @@ bool CreateEventTemplate::operator()(AnalysisContent& analysis) const LDEBUG << "CreateEventTemplate"; // validate current template by creating a new empty template which will be new current template + LDEBUG << "setTypeInCurrentTemplate" << m_eventType<setTypeInCurrentTemplate(m_eventType); 
+ eventData->addTemplate(); return true; } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateFillingActions.h b/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateFillingActions.h index c83e55bac..11f269418 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateFillingActions.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateFillingActions.h @@ -72,8 +72,8 @@ class LIMA_EVENTANALISYS_EXPORT CreateEventTemplate : public Automaton::Constrai ~CreateEventTemplate() {} bool operator()(AnalysisContent& analysis) const; - //bool actionNeedsRecognizedExpression() { return true; } private: + std::string m_eventType; }; class LIMA_EVENTANALISYS_EXPORT ClearEventTemplate : public Automaton::ConstraintFunction diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateStructure.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateStructure.cpp index 6d773c9bc..0407a6f3b 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateStructure.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateStructure.cpp @@ -1,21 +1,3 @@ -/* - Copyright 2002-2013 CEA LIST - - This file is part of LIMA. - - LIMA is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - LIMA is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with LIMA. 
If not, see -*/ /************************************************************************ * * @file EventTemplateStructure.cpp @@ -47,7 +29,7 @@ void EventTemplateStructure::addTemplateElement(const std::string& role, { if (m_structure.find(role)!=m_structure.end()) { LOGINIT("LP::EventAnalysis"); - LERROR << "In event " << m_name << ", element '"<< role <<"' is defined twice" ; + LERROR << "In event " << m_name << ", element '"<< role <<"' is defined twice" << LENDL; } else { Common::MediaticData::EntityType type= diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateStructure.h b/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateStructure.h index 4fe352bf0..8e5d09aff 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateStructure.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/EventAnalysis/EventTemplateStructure.h @@ -1,21 +1,3 @@ -/* - Copyright 2002-2013 CEA LIST - - This file is part of LIMA. - - LIMA is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - LIMA is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with LIMA. 
If not, see -*/ /************************************************************************ * * @file EventTemplateStructure.h @@ -49,15 +31,18 @@ class LIMA_EVENTANALISYS_EXPORT EventTemplateStructure ~EventTemplateStructure(); void setName(const std::string& name) { m_name=name; } + void setMention(const std::string& name) { m_mention=name; } void addTemplateElement(const std::string& role, const std::string entityType); const std::string& getName(void) const { return m_name; } + const std::string& getMention(void) const { return m_mention; } const std::map& getStructure(void) const { return m_structure; } private: std::string m_name; std::map m_structure; + std::string m_mention; }; } // end namespace diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/FlatTokenizer/Automaton.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/FlatTokenizer/Automaton.cpp index ea6caa1f0..1cb913865 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/FlatTokenizer/Automaton.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/FlatTokenizer/Automaton.cpp @@ -34,6 +34,7 @@ #include "State.h" #include "common/misc/Exceptions.h" +#include "common/tools/FileUtils.h" #include "common/Data/strwstrtools.h" #include "common/AbstractFactoryPattern/SimpleFactory.h" @@ -77,10 +78,8 @@ void Automaton::init( MediaId language=manager->getInitializationParameters().language; try { - std::string resourcePath=Common::MediaticData::MediaticData::single().getResourcesPath(); - std::string charChartFileName=resourcePath + "/" + unitConfiguration.getParamsValueAtKey("automatonFile"); - loadFromFile(charChartFileName); - + QString charChartFileName=Common::Misc::findFileInPaths(Common::MediaticData::MediaticData::single().getResourcesPath().c_str(),unitConfiguration.getParamsValueAtKey("automatonFile").c_str()); + loadFromFile(charChartFileName.toUtf8().constData()); } catch (Common::XMLConfigurationFiles::NoSuchParam& ) { LERROR << "no parameter 
'automatonFile' in tokenizer group for language " << (int) language << " !"; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/FlatTokenizer/CharChart.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/FlatTokenizer/CharChart.cpp index 3daeddb77..a33fde0e8 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/FlatTokenizer/CharChart.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/FlatTokenizer/CharChart.cpp @@ -33,6 +33,7 @@ #include "common/AbstractFactoryPattern/SimpleFactory.h" #include "common/MediaticData/mediaticData.h" +#include "common/tools/FileUtils.h" #include "common/Data/strwstrtools.h" #include @@ -150,9 +151,8 @@ void CharChart::init( MediaId language=manager->getInitializationParameters().language; try { - std::string resourcePath=Common::MediaticData::MediaticData::single().getResourcesPath(); - std::string charChartFileName=resourcePath + "/" + unitConfiguration.getParamsValueAtKey("charFile"); - loadFromFile(charChartFileName); + QString charChartFileName=Common::Misc::findFileInPaths(Common::MediaticData::MediaticData::single().getResourcesPath().c_str(),unitConfiguration.getParamsValueAtKey("charFile").c_str()); + loadFromFile(charChartFileName.toUtf8().constData()); } catch (Common::XMLConfigurationFiles::NoSuchParam& ) { diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/FlatTokenizer/Tokenizer.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/FlatTokenizer/Tokenizer.cpp index 8ce59fa70..062d34b4a 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/FlatTokenizer/Tokenizer.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/FlatTokenizer/Tokenizer.cpp @@ -35,6 +35,7 @@ #include "linguisticProcessing/core/LinguisticProcessors/LimaStringText.h" #include "linguisticProcessing/core/LinguisticAnalysisStructure/AnalysisGraph.h" #include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" +#include 
"common/tools/FileUtils.h" #include "common/MediaticData/mediaticData.h" #include "common/time/timeUtilsController.h" #include @@ -111,10 +112,9 @@ void Tokenizer::init( try { - string resourcesPath=Common::MediaticData::MediaticData::single().getResourcesPath(); - string fileName=resourcesPath +"/"+unitConfiguration.getParamsValueAtKey("automatonFile"); + QString fileName=Common::Misc::findFileInPaths(Common::MediaticData::MediaticData::single().getResourcesPath().c_str(),unitConfiguration.getParamsValueAtKey("automatonFile").c_str()); m_d->_automaton.setCharChart(m_d->_charChart); - m_d->_automaton.loadFromFile(fileName); + m_d->_automaton.loadFromFile(fileName.toUtf8().constData()); } catch (NoSuchParam& ) { diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/FlatTokenizer/TokenizerAutomaton.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/FlatTokenizer/TokenizerAutomaton.cpp index 57fed9350..e7138b78b 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/FlatTokenizer/TokenizerAutomaton.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/FlatTokenizer/TokenizerAutomaton.cpp @@ -84,50 +84,6 @@ void TokenizerAutomaton::init( } m_text=new Text(_language,_charChart); - - try - { - string resourcesPath=Common::MediaticData::MediaticData::single().getResourcesPath(); - string fileName=resourcesPath +"/"+unitConfiguration.getParamsValueAtKey("automatonFile"); - - } - catch (NoSuchParam& ) - { - LERROR << "no param 'automatonFile' in TokenizerAutomaton group configuration (language=" - << (int) _language << ")"; - throw InvalidConfiguration(); - } - // when input XML file is syntactically wrong - catch (XmlSyntaxException exc) - { - std::ostringstream mess; - mess << "XmlSyntaxException at line "< microFilters, LinguisticGraphVertex end) { +#ifdef DEBUG_LP + LASLOGINIT; +#endif /* * Algorithm: we're using a Breadth First Search and keep track of the * "thickness" of the lattice, and only stop if both condition 
apply: * 1/ the thickness is 1, meaning that every path goes through this node * 2/ the node is in microFilters (eg. a full stop in english) + * OR the node has t_sentence_break tokenization status */ std::set visited; LinguisticGraphOutEdgeIt outItr,outItrEnd; @@ -225,6 +229,7 @@ LinguisticGraphVertex AnalysisGraph::nextMainPathVertex( toVisit.push(target(*outItr,*m_graph)); } + VertexTokenPropertyMap tokenMap = get( vertex_token, *m_graph ); // search while (!toVisit.empty()) { @@ -235,13 +240,27 @@ LinguisticGraphVertex AnalysisGraph::nextMainPathVertex( { return end; } + Token* ft = tokenMap[current]; accumulator-=in_degree(current,*m_graph); if (accumulator==0) { // check unique category only if accumulator is 0 MorphoSyntacticData* msd=get(vertex_data,*m_graph,current); - if (msd!=0 && msd->hasUniqueMicro(microAccessor,microFilters)) return current; + if (msd!=0 && msd->hasUniqueMicro(microAccessor,microFilters)) + { +#ifdef DEBUG_LP + LDEBUG << "AnalysisGraph::nextMainPathVertex micro, return" << current; +#endif + return current; + } + if (ft && ft->status().getStatus() == T_SENTENCE_BRK) + { +#ifdef DEBUG_LP + LDEBUG << "AnalysisGraph::nextMainPathVertex sentence break, return" << current; +#endif + return current; + } } accumulator+=out_degree(current,*m_graph); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticAnalysisStructure/LinguisticGraph.h b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticAnalysisStructure/LinguisticGraph.h index f10b7210a..bbdb8d4c7 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticAnalysisStructure/LinguisticGraph.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticAnalysisStructure/LinguisticGraph.h @@ -38,7 +38,7 @@ #include #include "common/misc/depth_first_searchnowarn.hpp" #include -#include +#include //========== defines diff --git 
a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/AbstractTextualAnalysisDumper.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/AbstractTextualAnalysisDumper.cpp index 7f0ea1d6d..f49a267c9 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/AbstractTextualAnalysisDumper.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/AbstractTextualAnalysisDumper.cpp @@ -47,7 +47,8 @@ m_out(0), m_handlerName(), m_outputFile(), m_outputSuffix(), -m_append(false) +m_append(false), +m_temporaryFileMetadata() { } @@ -69,6 +70,12 @@ void AbstractTextualAnalysisDumper::init( } catch (NoSuchParam& ) { } // do nothing, optional + try + { + m_temporaryFileMetadata = QString::fromUtf8(unitConfiguration.getParamsValueAtKey("temporaryFileMetadata").c_str()); + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) {} // keep default value (empty) + try { m_outputSuffix=unitConfiguration.getParamsValueAtKey("outputSuffix"); @@ -119,9 +126,23 @@ initialize(AnalysisContent& analysis) const } } + if (! m_temporaryFileMetadata.isEmpty()) { +#ifdef DEBUG_LP + LDEBUG << "AbstractTextualAnalysisDumper: initialize DumperStream with temporary file metadata"; +#endif + LinguisticMetaData* metadata=static_cast(analysis.getData("LinguisticMetaData")); + if (metadata == 0) { + LERROR << "no LinguisticMetaData ! abort"; + } +#ifdef DEBUG_LP + LDEBUG << "AbstractTextualAnalysisDumper: initialize DumperStream with metadata value"<< metadata->getMetaData(m_temporaryFileMetadata.toUtf8().constData()); +#endif + return new DumperStream(metadata->getMetaData(m_temporaryFileMetadata.toUtf8().constData()),m_append); + } + if (! 
m_outputFile.empty()) { #ifdef DEBUG_LP - LDEBUG << "AbstractTextualAnalysisDumper: initialize DumperStream with output file "<< m_outputFile; + LDEBUG << "AbstractTextualAnalysisDumper: initialize DumperStream with output file"<< m_outputFile << m_append; #endif return new DumperStream(m_outputFile,m_append); } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/AbstractTextualAnalysisDumper.h b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/AbstractTextualAnalysisDumper.h index 617fdc48d..9438ab56a 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/AbstractTextualAnalysisDumper.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/AbstractTextualAnalysisDumper.h @@ -69,6 +69,7 @@ class LIMA_LINGUISTICPROCESSORS_EXPORT AbstractTextualAnalysisDumper : public Me std::string m_outputFile; /* < the file name for local file logging */ std::string m_outputSuffix; /* < the suffix for local file logging */ bool m_append; + QString m_temporaryFileMetadata; }; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/AnalysisLoader.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/AnalysisLoader.cpp index af07d5fb2..52cb5be63 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/AnalysisLoader.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/AnalysisLoader.cpp @@ -1,3 +1,153 @@ +#ifdef ANTINNO_SPECIFIC + + + + +// antinno travaille avec la version 2.1-patches tant que la version master n'est pas synchronisée + + + + +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. 
+ + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. If not, see +*/ +/************************************************************************ + * + * @file AnalysisLoader.cpp + * @author Romaric Besancon (romaric.besancon@cea.fr) + * @date Tue Jan 18 2011 + * copyright Copyright (C) 2011 by CEA LIST + * + ***********************************************************************/ + +#include "AnalysisLoader.h" + +#include "linguisticProcessing/core/LinguisticProcessors/LinguisticMetaData.h" +#include "common/AbstractFactoryPattern/SimpleFactory.h" + +namespace Lima { +namespace LinguisticProcessing { + +SimpleFactory AnalysisLoaderFactory(ANALYSISLOADER_CLASSID); + +//*********************************************************************** +// constructors and destructors +AnalysisLoader::AnalysisLoader(): +MediaProcessUnit(), +m_inputFileName(), +m_inputFileExtension(), +m_temporaryFileMetadata() +{ +} + +AnalysisLoader::~AnalysisLoader() { +} + +//*********************************************************************** +void AnalysisLoader::init(Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, + Manager* /*manager*/) + +{ + LOGINIT("LP::AnalysisLoader"); + LDEBUG << "Initialization"; + + bool parameterFound(false); + try + { + m_temporaryFileMetadata = QString::fromUtf8(unitConfiguration.getParamsValueAtKey("temporaryFileMetadata").c_str()); + parameterFound=true; + } + catch 
(Common::XMLConfigurationFiles::NoSuchParam& ) {} // keep default value (empty) + + try { + m_inputFileName=unitConfiguration.getParamsValueAtKey("inputFile"); + parameterFound=true; + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + } + + try { + m_inputFileExtension=unitConfiguration.getParamsValueAtKey("inputSuffix"); + parameterFound=true; + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + } + + if (! parameterFound) { + LERROR << "No 'inputFile' or 'inputSuffix' or 'temporaryFileMetadata' parameter in AnalysisLoader"; + throw InvalidConfiguration(); + } + +} + +const std::string& AnalysisLoader::getInputFile(AnalysisContent& analysis) const +{ + static std::string inputFile(""); + if (! m_temporaryFileMetadata.isEmpty()) { + // get temporary filename from metadata + LinguisticMetaData* metadata=static_cast(analysis.getData("LinguisticMetaData")); + if (metadata == 0) + { + LOGINIT("LP::AnalysisLoader"); + LERROR << "no LinguisticMetaData : cannot use 'temporaryFileMetadata' parameter for AnalysisLoader"; + return inputFile; + } + + inputFile = metadata->getMetaData(m_temporaryFileMetadata.toUtf8().constData()); + return inputFile; + } + else if (! m_inputFileName.empty()) { + return m_inputFileName; + } + else if (! 
m_inputFileExtension.empty()) { + // get filename from metadata + LinguisticMetaData* metadata=static_cast(analysis.getData("LinguisticMetaData")); + if (metadata == 0) + { + LOGINIT("LP::AnalysisLoader"); + LERROR << "no LinguisticMetaData : cannot use 'inputSuffix' parameter for AnalysisLoader"; + return inputFile; + } + + std::string textFileName = metadata->getMetaData("FileName"); + inputFile = textFileName + m_inputFileExtension; + return inputFile; + } + LOGINIT("LP::AnalysisLoader"); + LERROR << "No 'inputFile' found in AnalysisLoader"; + return inputFile; +} + + +} // end namespace +} // end namespace + + + + +#else + + + +// version master + + + /* Copyright 2002-2013 CEA LIST @@ -40,7 +190,8 @@ SimpleFactory AnalysisLoaderFactory(ANALYSISLOA AnalysisLoader::AnalysisLoader(): MediaProcessUnit(), m_inputFileName(), -m_inputFileExtension() +m_inputFileExtension(), +m_temporaryFileMetadata() { } @@ -56,6 +207,13 @@ void AnalysisLoader::init(Common::XMLConfigurationFiles::GroupConfigurationStruc LDEBUG << "Initialization"; bool parameterFound(false); + try + { + m_temporaryFileMetadata = QString::fromUtf8(unitConfiguration.getParamsValueAtKey("temporaryFileMetadata").c_str()); + parameterFound=true; + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) {} // keep default value (empty) + try { m_inputFileName=unitConfiguration.getParamsValueAtKey("inputFile"); parameterFound=true; @@ -71,7 +229,7 @@ void AnalysisLoader::init(Common::XMLConfigurationFiles::GroupConfigurationStruc } if (! parameterFound) { - LERROR << "No 'inputFile' or 'inputSuffix' parameter in AnalysisLoader"; + LERROR << "No 'inputFile' or 'inputSuffix' or 'temporaryFileMetadata' parameter in AnalysisLoader"; throw InvalidConfiguration(); } @@ -80,7 +238,20 @@ void AnalysisLoader::init(Common::XMLConfigurationFiles::GroupConfigurationStruc const std::string& AnalysisLoader::getInputFile(AnalysisContent& analysis) const { static std::string inputFile(""); - if (! 
m_inputFileName.empty()) { + if (! m_temporaryFileMetadata.isEmpty()) { + // get temporary filename from metadata + LinguisticMetaData* metadata=static_cast(analysis.getData("LinguisticMetaData")); + if (metadata == 0) + { + LOGINIT("LP::AnalysisLoader"); + LERROR << "no LinguisticMetaData : cannot use 'temporaryFileMetadata' parameter for AnalysisLoader"; + return inputFile; + } + + inputFile = metadata->getMetaData(m_temporaryFileMetadata.toUtf8().constData()); + return inputFile; + } + else if (! m_inputFileName.empty()) { return m_inputFileName; } else if (! m_inputFileExtension.empty()) { @@ -105,3 +276,7 @@ const std::string& AnalysisLoader::getInputFile(AnalysisContent& analysis) const } // end namespace } // end namespace + + + +#endif \ No newline at end of file diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/AnalysisLoader.h b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/AnalysisLoader.h index b46dddc44..2cf11d21b 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/AnalysisLoader.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/AnalysisLoader.h @@ -1,3 +1,98 @@ +#ifdef ANTINNO_SPECIFIC + + + + +// antinno travaille avec la version 2.1-patches tant que la version master n'est pas synchronisée + + + + +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. 
+ + You should have received a copy of the GNU Affero General Public License + along with LIMA. If not, see +*/ +/************************************************************************ + * + * @file AnalysisLoader.h + * @author besancon (besanconr@zoe.cea.fr) + * @date Mon Jan 17 2011 + * copyright Copyright (C) 2011 by CEA LIST (LVIC) + * Project MM + * + * @brief abstract class for analysis loaders + * + * + ***********************************************************************/ +#ifndef LIMA_LINGUISTICPROCESSING_ANALYSISLOADER_H +#define LIMA_LINGUISTICPROCESSING_ANALYSISLOADER_H + +#include "LinguisticProcessorsExport.h" +#include "common/MediaProcessors/MediaProcessUnit.h" +#include +#include + +namespace Lima { +namespace LinguisticProcessing { + +#define ANALYSISLOADER_CLASSID "AnalysisLoader" + +/* + * @brief this is the abstract class for analysis loaders, that read + * informations from external files to insert them in the analysis + * data + */ +class LIMA_LINGUISTICPROCESSORS_EXPORT AnalysisLoader : public MediaProcessUnit +{ +public: + AnalysisLoader(); + + virtual ~AnalysisLoader(); + + void init(Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, + Manager* manager) + ; + + LimaStatusCode process(AnalysisContent& /*analysis*/) const { return SUCCESS_ID; } + + const std::string& getInputFile(AnalysisContent& analysis) const; + +protected: + std::string m_inputFileName; + std::string m_inputFileExtension; + QString m_temporaryFileMetadata; +}; + +} // end namespace +} // end namespace + +#endif + + + + +#else + + +// version master + + + + /* Copyright 2002-2013 CEA LIST @@ -64,9 +159,14 @@ class LIMA_LINGUISTICPROCESSORS_EXPORT AnalysisLoader : public MediaProcessUnit protected: std::string m_inputFileName; std::string m_inputFileExtension; + QString m_temporaryFileMetadata; }; } // end namespace } // end namespace #endif + + + +#endif \ No newline at end of file diff --git 
a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/ExternalProcessUnit.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/ExternalProcessUnit.cpp index baf70183c..c64202cfc 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/ExternalProcessUnit.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/ExternalProcessUnit.cpp @@ -1,3 +1,11 @@ +#ifdef ANTINNO_SPECIFIC + + + + +// antinno travaille avec la version 2.1-patches tant que la version master n'est pas synchronisée + + /* Copyright 2002-2013 CEA LIST @@ -238,3 +246,215 @@ LimaStatusCode ExternalProcessUnit::process(AnalysisContent& analysis) const } // end namespace } // end namespace + + + +#else + + +// version master + +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. 
If not, see +*/ +/************************************************************************ + * + * @file ExternalProcessUnit.cpp + * @author besancon (besanconr@zoe.cea.fr) + * @date Mon Jan 17 2011 + * copyright Copyright (C) 2011 by CEA LIST (LVIC) + * + ***********************************************************************/ + +#include "ExternalProcessUnit.h" + +#include "common/AbstractFactoryPattern/SimpleFactory.h" +#include "linguisticProcessing/common/annotationGraph/AnnotationData.h" +#include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" +#include "common/time/traceUtils.h" +#include "linguisticProcessing/core/LinguisticProcessors/LinguisticMetaData.h" +#include "linguisticProcessing/core/LinguisticResources/AbstractResource.h" +#include "linguisticProcessing/core/LinguisticResources/LinguisticResources.h" +#include "linguisticProcessing/core/LinguisticAnalysisStructure/AnalysisGraph.h" +#include "linguisticProcessing/core/TextSegmentation/SegmentationData.h" +#include "linguisticProcessing/client/AnalysisHandlers/SimpleStreamHandler.h" + +//#include "boost/process.hpp" +#include + +#include + +//namespace bp = ::boost::process; + +using namespace std; + +namespace Lima { +namespace LinguisticProcessing { + +SimpleFactory ExternalProcessUnitFactory(EXTERNALPROCESSUNIT_CLASSID); + +ExternalProcessUnit::ExternalProcessUnit(): +MediaProcessUnit(), +m_dumper(), +m_loader(), +m_commandLine(), +m_inputSuffix(), +m_outputSuffix() +{ +} + +ExternalProcessUnit::~ExternalProcessUnit() +{ +} + + +void ExternalProcessUnit::init( + Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, + Manager* manager) + +{ + LOGINIT("LP::External"); + LDEBUG << "Initialization"; + + MediaId language=manager->getInitializationParameters().media; + try { + string dumperName=unitConfiguration.getParamsValueAtKey("dumper"); + // create the dumper + m_dumper=manager->getObject(dumperName); + } + catch 
(Common::XMLConfigurationFiles::NoSuchParam& ) { + LERROR << "Missing 'dumper' parameter in ExternalProcessUnit group for language " + << (int)language << " !"; + throw InvalidConfiguration(); + } + + try { + string loaderName=unitConfiguration.getParamsValueAtKey("loader"); + // create the loader + m_loader=manager->getObject(loaderName); + } + catch (InvalidConfiguration& ) { + m_loader = 0; + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + LERROR << "Missing 'loader' parameter in ExternalProcessUnit group for language " + << (int)language << " !"; + throw InvalidConfiguration(); + } + + try { + m_inputSuffix=QString::fromUtf8(unitConfiguration.getParamsValueAtKey("inputSuffix").c_str()); + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + // optional parameter: keep default value + } + + try { + m_outputSuffix=QString::fromUtf8(unitConfiguration.getParamsValueAtKey("outputSuffix").c_str()); + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + // optional parameter: keep default value + } + + try { + m_commandLine=QString::fromUtf8(unitConfiguration.getParamsValueAtKey("command").c_str()); + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + LERROR << "Missing 'command' parameter in ExternalProcessUnit group for language " + << (int)language << " !"; + throw InvalidConfiguration(); + } +} + +LimaStatusCode ExternalProcessUnit::process(AnalysisContent& analysis) const +{ + TimeUtils::updateCurrentTime(); + LOGINIT("LP::External"); + LINFO << "ExternalProcessUnit: start"; + + LinguisticMetaData* metadata=static_cast(analysis.getData("LinguisticMetaData")); + if (metadata == 0) { + LERROR << "no LinguisticMetaData ! 
abort"; + return MISSING_DATA; + } + + LimaStatusCode returnCode(SUCCESS_ID); + + // produce temporary file with the given dumper + LDEBUG << "ExternalProcessUnit: write tmp file"; + returnCode=m_dumper->process(analysis); + if (returnCode!=SUCCESS_ID) { + LERROR << "ExternalProcessUnit: failed to dump data to temporary file"; + return returnCode; + } + + QString fileName = QString::fromUtf8(metadata->getMetaData("FileName").c_str()); + QString inputFilename, outputFilename; + // apply command line + LDEBUG << "ExternalProcessUnit: apply external program"; + QString commandLine = m_commandLine; + if (!m_inputSuffix.isEmpty()) + { + inputFilename = fileName+ m_inputSuffix; + } + if (!m_outputSuffix.isEmpty()) + { + outputFilename = fileName + m_outputSuffix; + } + commandLine = commandLine.arg(inputFilename).arg(outputFilename); + LDEBUG << "Launching " << commandLine; + int processResult = QProcess::execute(commandLine); + switch (processResult) { + case -2 : + LERROR << "ExternalProcessUnit: Was not able to start '" << commandLine << "'" ; + return returnCode; + case -1 : + LERROR << "ExternalProcessUnit: '" << commandLine << "' crashed!"; + return returnCode; + case 0 : + break; + default: + LERROR << "ExternalProcessUnit: '" << commandLine << "' returned error status:" << processResult; + return returnCode; + } + + if (m_loader != 0) { + // load results from the external program with the given loader + LDEBUG << "ExternalProcessUnit: read results"; + returnCode=m_loader->process(analysis); + if (returnCode!=SUCCESS_ID) { + LERROR << "ExternalProcessUnit: failed to load data from temporary file"; + return returnCode; + } + } + else { + LWARN << "ExternalProcessUnit: no loader defined for the current external process unit"; + } + + TimeUtils::logElapsedTime("ExternalProcessUnit"); + return returnCode; +} + + +} // end namespace +} // end namespace + + + + +#endif \ No newline at end of file diff --git 
a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/ExternalProcessUnit.h b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/ExternalProcessUnit.h index 0c09a7cbc..5ceb62b45 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/ExternalProcessUnit.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/ExternalProcessUnit.h @@ -1,3 +1,12 @@ +#ifdef ANTINNO_SPECIFIC + + + + +// antinno travaille avec la version 2.1-patches tant que la version master n'est pas synchronisée + + + /* Copyright 2002-2013 CEA LIST @@ -77,3 +86,94 @@ class LIMA_LINGUISTICPROCESSORS_EXPORT ExternalProcessUnit : public MediaProcess } // end namespace #endif + + + + +#else + + + + +// version master + + +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. If not, see +*/ +/************************************************************************ + * + * @file externalProcessUnit.h + * @author besancon (besanconr@zoe.cea.fr) + * @date Mon Jan 17 2011 + * copyright Copyright (C) 2011 by CEA LIST (LVIC) + * Project MM + * + * @brief this class contains a generic process unit that use a system call + * to let an external process do the job. 
+ * + * + ***********************************************************************/ +#ifndef LIMA_LINGUISTICPROCESSING_EXTERNALPROCESSUNIT_H +#define LIMA_LINGUISTICPROCESSING_EXTERNALPROCESSUNIT_H + +#include "LinguisticProcessorsExport.h" +#include "common/MediaProcessors/MediaProcessUnit.h" +#include "linguisticProcessing/client/AnalysisHandlers/AbstractTextualAnalysisHandler.h" + +namespace Lima { +namespace LinguisticProcessing { + +#define EXTERNALPROCESSUNIT_CLASSID "ExternalProcessUnit" + +/* + * @brief this class contains a generic process unit that use a system + * call to let an external process do the job. The input for this + * external program is produced by a dumper given as a parameter, + * and the output is read by a Loader also given as a parameter + */ +class LIMA_LINGUISTICPROCESSORS_EXPORT ExternalProcessUnit : public MediaProcessUnit +{ +public: + ExternalProcessUnit(); + + virtual ~ExternalProcessUnit(); + + void init(Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, + Manager* manager) + ; + + LimaStatusCode process(AnalysisContent& analysis) const; + +private: + const MediaProcessUnit* m_dumper; + const MediaProcessUnit* m_loader; + QString m_commandLine; + QString m_inputSuffix; + QString m_outputSuffix; +}; + +} // end namespace +} // end namespace + +#endif + + + + +#endif \ No newline at end of file diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/StatusLogger.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/StatusLogger.cpp index 36a99aa9f..8107e9467 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/StatusLogger.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/StatusLogger.cpp @@ -29,6 +29,7 @@ #include "LinguisticMetaData.h" #include "LimaStringText.h" +#include "common/Data/strwstrtools.h" #include 
"common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" #include "common/AbstractFactoryPattern/SimpleFactory.h" #include "common/time/traceUtils.h" @@ -46,12 +47,13 @@ namespace LinguisticProcessing SimpleFactory statusLoggerFactory(STATUSLOGGER_CLASSID); -StatusLogger::StatusLogger() +StatusLogger::StatusLogger() {} StatusLogger::~StatusLogger() -{} +{ +} void StatusLogger::init( @@ -75,7 +77,7 @@ void StatusLogger::init( { outputFile=string("status.log"); } - m_out= new ofstream(outputFile.c_str(), std::ofstream::binary); + m_out = std::unique_ptr< std::ofstream >(new ofstream(outputFile.c_str(), std::ofstream::binary)); try { deque tolog=unitConfiguration.getListsValueAtKey("toLog"); @@ -118,7 +120,7 @@ LimaStatusCode StatusLogger::process( string line; while (!statusIn.eof()) { - getline(statusIn,line); + line = Lima::Common::Misc::readLine(statusIn); size_t index=line.find(":"); string key=line.substr(0,index); if (m_toLog.find(key)!=m_toLog.end()) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/StatusLogger.h b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/StatusLogger.h index bb2b2873b..98d35fb25 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/StatusLogger.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticProcessors/StatusLogger.h @@ -61,7 +61,7 @@ class LIMA_LINGUISTICPROCESSORS_EXPORT StatusLogger : public MediaProcessUnit private: - std::ostream* m_out; + std::unique_ptr< std::ofstream > m_out; std::set m_toLog; std::string m_statusFile; uint64_t m_predTime; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticResources/AbstractResource.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticResources/AbstractResource.cpp index 720040ab1..378f40d5f 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticResources/AbstractResource.cpp +++ 
b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticResources/AbstractResource.cpp @@ -37,7 +37,10 @@ friend class AbstractResource; LimaFileSystemWatcher m_resourceFileWatcher; }; -AbstractResource::AbstractResource( QObject* parent ) : QObject( parent ), m_d(new AbstractResourcePrivate()) +AbstractResource::AbstractResource( QObject* parent ) : + QObject( parent ), + InitializableObject(), + m_d(new AbstractResourcePrivate()) { connect(&m_d->m_resourceFileWatcher,SIGNAL(fileChanged(QString)),this,SIGNAL(resourceFileChanged(QString))); } @@ -47,7 +50,10 @@ AbstractResource::~AbstractResource() delete m_d; } -AbstractResource::AbstractResource(const AbstractResource& r) : QObject(r.parent()), m_d(new AbstractResourcePrivate(*r.m_d)) +AbstractResource::AbstractResource(const AbstractResource& r) : + QObject(r.parent()), + InitializableObject(), + m_d(new AbstractResourcePrivate(*r.m_d)) { connect(&m_d->m_resourceFileWatcher,SIGNAL(fileChanged(QString)),this,SIGNAL(resourceFileChanged(QString))); } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticResources/LinguisticResources.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticResources/LinguisticResources.cpp index 25ff78cd8..b3da33aea 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticResources/LinguisticResources.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/LinguisticResources/LinguisticResources.cpp @@ -27,8 +27,11 @@ #include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" #include "common/MediaticData/mediaticData.h" #include "common/AbstractFactoryPattern/Singleton.h" +#include "common/tools/FileUtils.h" #include "linguisticProcessing/core/AnalysisDict/AbstractAccessResource.h" +#include + using namespace std; using namespace Lima::Common::XMLConfigurationFiles; @@ -153,8 +156,21 @@ includeResources(Common::XMLConfigurationFiles::ModuleConfigurationStructure& mo #ifdef DEBUG_LP 
LDEBUG << "i="<< i; #endif - fileName=Common::MediaticData::MediaticData::single().getConfigPath()+ - "/"+string((*it),0,i); + QStringList configPaths = QString::fromUtf8(Common::MediaticData::MediaticData::single().getConfigPath().c_str()).split(LIMA_PATH_SEPARATOR); + Q_FOREACH(QString confPath, configPaths) + { + if (QFileInfo(confPath + "/" + string((*it),0,i).c_str()).exists()) + { + + fileName = (confPath + "/" + string((*it),0,i).c_str()).toUtf8().constData(); + break; + } + } + if (fileName.empty()) + { + LERROR << "No resources" << *it << "found in" << Common::MediaticData::MediaticData::single().getConfigPath(); + continue; + } moduleName=string((*it),i+1); LINFO << "includeResources filename="<< fileName << "moduleName="<< moduleName; XMLConfigurationFileParser parser(fileName); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/Modex/Modex.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/Modex/Modex.cpp index 1043fc1bc..ec0c5f90a 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/Modex/Modex.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/Modex/Modex.cpp @@ -30,6 +30,7 @@ #include "linguisticProcessing/core/LinguisticResources/LinguisticResources.h" #include "common/MediaProcessors/MediaProcessors.h" // #include "linguisticProcessing/common/linguisticData/linguisticData.h" +#include "common/tools/FileUtils.h" #include "common/Data/strwstrtools.h" #include "common/time/traceUtils.h" @@ -64,7 +65,7 @@ void Modex::init(GroupConfigurationStructure& unitConfiguration, try { // try to get a single automaton string filename=unitConfiguration.getParamsValueAtKey("modexConfig"); - string configFile=LinguisticData::single().getConfigPath()+"/"+filename; + string configFile=Common::Misc::findFileInPaths(LinguisticData::single().getConfigPath().c_str(),filename.c_str()).toUtf8().constData(); initModex(configFile,m_language); } @@ -156,7 +157,7 @@ addConfiguration(ModuleConfigurationStructure& 
modexConfig, void Modex:: initEntities(const std::string& filename) { - XMLConfigurationFileParser configuration(LinguisticData::single().getConfigPath() + "/" + filename); + XMLConfigurationFileParser configuration(Common::Misc::findFileInPaths(LinguisticData::single().getConfigPath().c_str(),filename.c_str()).toUtf8().constData()); initEntities(configuration); } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/AbbreviationSplitAlternatives.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/AbbreviationSplitAlternatives.cpp index 524d1ba7d..65702f83c 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/AbbreviationSplitAlternatives.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/AbbreviationSplitAlternatives.cpp @@ -77,8 +77,19 @@ namespace MorphologicAnalysis SimpleFactory abbreviationSplitAlternativesFactory(ABBREVIATIONSPLITALTERNATIVESFACTORY_CLASSID); AbbreviationSplitAlternatives::AbbreviationSplitAlternatives() : - m_reader(0) -{} +m_tokenizer(0), +m_dictionary(0), +m_abbreviations(), +m_language(), +m_confidentMode(true), +m_reader(0), +m_charSplitRegexp() +{ + // default split regexp: split on simple quote or UTF-8 right quotation mark + LimaString quotes=Common::Misc::utf8stdstring2limastring("['’]"); + m_charSplitRegexp=QRegExp(quotes); + +} AbbreviationSplitAlternatives::~AbbreviationSplitAlternatives() { @@ -156,6 +167,19 @@ void AbbreviationSplitAlternatives::init( LWARN << "use default value : 'true'"; m_confidentMode=true; } + + try + { + string charSplit=unitConfiguration.getParamsValueAtKey("charSplitRegexp"); + m_charSplitRegexp=QRegExp(Common::Misc::utf8stdstring2limastring(charSplit)); + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) + { + LWARN << "no param 'charSplitRegexp' in AbbreviationSplitAlternatives group for language " << (int) m_language; + LWARN << "use default value : ['’]"; + 
// m_confidentMode is left as configured above; the default regexp set in the constructor stays in effect + } + FsaStringsPool* sp=&Common::MediaticData::MediaticData::changeable().stringsPool(m_language); m_reader=new AlternativesReader(m_confidentMode,true,true,true,charChart,sp); @@ -168,6 +192,9 @@ LimaStatusCode AbbreviationSplitAlternatives::process( MORPHOLOGINIT; LINFO << "MorphologicalAnalysis: starting process AbbreviationSplitAlternatives"; +#ifdef ANTINNO_SPECIFIC + auto const& stopAnalyze = analysis.stopAnalyze(); +#endif AnalysisGraph* tokenList=static_cast(analysis.getData("AnalysisGraph")); LinguisticGraph* graph=tokenList->getGraph(); @@ -193,6 +220,14 @@ LimaStatusCode AbbreviationSplitAlternatives::process( boost::tie(it, it_end) = vertices(*graph); for (; it != it_end; it++) { +#ifdef ANTINNO_SPECIFIC + if (stopAnalyze) + { + MORPHOLOGINIT + LERROR << "Analyze too long. Stopped in AbbreviationSplitAlternatives"; + return TIME_OVERFLOW; + } +#endif MorphoSyntacticData* currentData = dataMap[*it]; if (currentData == 0) continue; Token* currentToken= tokenMap[*it]; @@ -268,8 +303,12 @@ bool AbbreviationSplitAlternatives::makeConcatenatedAbbreviationSplitAlternative const LimaString& ft = ftok->stringForm(); LDEBUG << "AbbreviationSplitAlternatives::makeConcatenatedAbbreviationSplitAlternativeFor " << Common::Misc::limastring2utf8stdstring(ft); - int aposPos = ft.indexOf(Common::Misc::utf8stdstring2limastring("'"), 0); - if (aposPos==-1 || aposPos==0) return false; + //int aposPos = ft.indexOf(Common::Misc::utf8stdstring2limastring("'"), 0); + int aposPos = ft.indexOf(m_charSplitRegexp, 0); + //LDEBUG << "AbbreviationSplitAlternatives: split chars found at " << aposPos; + if (aposPos==-1 || aposPos==0) { + return false; + } LimaString beforeAbbrev(ft.left(aposPos-1)); std::vector< LimaString >::const_iterator itAbb = m_abbreviations.begin(); @@ -357,7 +396,14 @@ bool AbbreviationSplitAlternatives::makeConcatenatedAbbreviationSplitAlternative { LERROR << 
"AbbreviationSplitAlternatives::makeConcatenatedAbbreviationSplitAlternativeFor: Cannot find a dictionary entry for abbreviated word " << Lima::Common::Misc::limastring2utf8stdstring(abbrev); } - + if (newData->empty()) + { + MORPHOLOGINIT; + LERROR << "AbbreviationSplitAlternatives::makeConcatenatedAbbreviationSplitAlternativeFor Got empty morphosyntactic data. Abort."; + delete newFT; + delete newData; + return false; + } // LinguisticGraphVertex afterVertex = listIterator.createVertexFor(newFT); LinguisticGraphVertex afterVertex = add_vertex(*graph); put(vertex_token,*graph,afterVertex,newFT); @@ -400,7 +446,8 @@ bool AbbreviationSplitAlternatives::makePossessiveAlternativeFor( const LimaString& ft = ftok->stringForm(); LDEBUG << "AbbreviationSplitAlternatives::makePossessiveAlternativeFor " << Common::Misc::limastring2utf8stdstring(ft); - int aposPos = ft.indexOf(LimaChar('\''), 0); + //int aposPos = ft.indexOf(LimaChar('\''), 0); + int aposPos = ft.indexOf(m_charSplitRegexp, 0); if (aposPos==-1 || aposPos==0) return false; LimaString possessivedWord(ft.left(aposPos)); LDEBUG << "AbbreviationSplitAlternatives::makePossessiveAlternativeFor possesive word: " << Common::Misc::limastring2utf8stdstring(possessivedWord); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/AbbreviationSplitAlternatives.h b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/AbbreviationSplitAlternatives.h index 2426f91f3..1b3a84808 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/AbbreviationSplitAlternatives.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/AbbreviationSplitAlternatives.h @@ -86,6 +86,7 @@ class LIMA_MORPHOLOGICANALYSIS_EXPORT AbbreviationSplitAlternatives : public Med MediaId m_language; bool m_confidentMode; AlternativesReader* m_reader; + QRegExp m_charSplitRegexp; bool makeConcatenatedAbbreviationSplitAlternativeFor( 
LinguisticGraphVertex splitted, diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/CMakeLists.txt b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/CMakeLists.txt index ede2e7344..bc566d198 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/CMakeLists.txt +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/CMakeLists.txt @@ -43,6 +43,13 @@ SET(lima-lp-morphologicanalysis_LIB_SRCS ) endif (ENCHANT_FOUND) +if (HUNSPELL_FOUND) +SET(lima-lp-morphologicanalysis_LIB_SRCS + ${lima-lp-morphologicanalysis_LIB_SRCS} + HunspellSpellingAlternatives.cpp +) +endif () + DECLARE_LIMA_PLUGIN(lima-lp-morphologicanalysis) target_link_libraries(lima-lp-morphologicanalysis diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/ConcatenatedDataHandler.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/ConcatenatedDataHandler.cpp index 5465fcceb..7907b1236 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/ConcatenatedDataHandler.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/ConcatenatedDataHandler.cpp @@ -77,11 +77,20 @@ void ConcatenatedDataHandler::foundComponent(uint64_t position, uint64_t length, } concatenated.push_back(componentVertex); +#ifdef ANTINNO_SPECIFIC + // FWI 04/04/2016 + // plantage de l'indexeur sauf si les 4 lignes suivantes sont commentées + // apparement le fait de détruire *this plante nt.dll sans que la cause soit évidente + // pour test : voir le doc "constitution 2011" en ARA sur la machine "lirac" + // sur ma machine ça ne plante pas systématiquement... 
+ // A noter : désactiver le paramètre "parseConcatenated" dans SimpleWord permet de courtcircuiter le problème + // -> à investiguer +#endif m_currentToken=new Token(form,(*m_stringsPool)[form],m_srcToken->position()+position,length,m_srcToken->status()); put(vertex_token,*m_graph,componentVertex,m_currentToken); m_currentData=new MorphoSyntacticData(); put(vertex_data,*m_graph,componentVertex,m_currentData); - + m_currentElement.inflectedForm=form; } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/DefaultProperties.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/DefaultProperties.cpp index 10571135b..cdfd130c5 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/DefaultProperties.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/DefaultProperties.cpp @@ -33,6 +33,7 @@ #include "common/MediaticData/mediaticData.h" #include "common/time/timeUtilsController.h" +#include "common/tools/FileUtils.h" #include "common/Data/strwstrtools.h" #include "common/AbstractFactoryPattern/SimpleFactory.h" #include "linguisticProcessing/core/FlatTokenizer/CharChart.h" @@ -73,8 +74,8 @@ void DefaultProperties::init( std::deque skipUnmarkStatus; try { - string file=Common::MediaticData::MediaticData::single().getResourcesPath() + "/" + unitConfiguration.getParamsValueAtKey("defaultPropertyFile"); - readDefaultsFromFile(file); + QString file = Common::Misc::findFileInPaths(Common::MediaticData::MediaticData::single().getResourcesPath().c_str(), unitConfiguration.getParamsValueAtKey("defaultPropertyFile").c_str()); + readDefaultsFromFile(file.toUtf8().constData()); } catch (Common::XMLConfigurationFiles::NoSuchParam& ) { @@ -189,7 +190,7 @@ void DefaultProperties::readDefaultsFromFile(const std::string& filename) string type; LinguisticCode props; while (fin.good() && !fin.eof()) { - getline(fin,line); + line = 
Lima::Common::Misc::readLine(fin); if (line.size()>0) { istringstream is(line); is >> type; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/DesagglutinationResources.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/DesagglutinationResources.cpp index 525b2855f..3bd18b633 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/DesagglutinationResources.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/DesagglutinationResources.cpp @@ -28,6 +28,7 @@ #include "DesagglutinationResources.h" #include "common/AbstractFactoryPattern/SimpleFactory.h" +#include "common/tools/FileUtils.h" #include "common/MediaticData/mediaticData.h" #include "linguisticProcessing/common/PropertyCode/PropertyCodeManager.h" #include "linguisticProcessing/common/PropertyCode/PropertyManager.h" @@ -61,8 +62,8 @@ void DesagglutinationResources::init( string resourcesPath=MediaticData::single().getResourcesPath(); try { - string file=resourcesPath + "/" + unitConfiguration.getParamsValueAtKey("categoriesMappingFile"); - loadMicroCategoriesMappingFromFile(file); + QString file = Common::Misc::findFileInPaths(resourcesPath.c_str(), unitConfiguration.getParamsValueAtKey("categoriesMappingFile").c_str()); + loadMicroCategoriesMappingFromFile(file.toUtf8().constData()); } catch (Common::XMLConfigurationFiles::NoSuchParam& ) { @@ -71,8 +72,8 @@ void DesagglutinationResources::init( } try { - string file=resourcesPath + "/" + unitConfiguration.getParamsValueAtKey("delimiterFile"); - loadDelimitersFromFile(file); + QString file=Common::Misc::findFileInPaths(resourcesPath.c_str(), unitConfiguration.getParamsValueAtKey("delimiterFile").c_str()); + loadDelimitersFromFile(file.toUtf8().constData()); } catch (Common::XMLConfigurationFiles::NoSuchParam& ) { diff --git 
a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/HunspellSpellingAlternatives.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/HunspellSpellingAlternatives.cpp new file mode 100644 index 000000000..921b93ff1 --- /dev/null +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/HunspellSpellingAlternatives.cpp @@ -0,0 +1,263 @@ +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. 
If not, see +*/ + +#include "HunspellSpellingAlternatives.h" +#include "MorphoSyntacticDataHandler.h" + +#include "common/time/traceUtils.h" +#include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" +#include "common/AbstractFactoryPattern/SimpleFactory.h" +#include "common/MediaticData/mediaticData.h" +#include "linguisticProcessing/core/FlatTokenizer/CharChart.h" +#include "linguisticProcessing/core/LinguisticResources/LinguisticResources.h" +#include "linguisticProcessing/core/LinguisticAnalysisStructure/AnalysisGraph.h" +#include "linguisticProcessing/core/LinguisticAnalysisStructure/MorphoSyntacticData.h" +#include "linguisticProcessing/core/AnalysisDict/AbstractAnalysisDictionary.h" + +#include +#include + +using namespace Lima::LinguisticProcessing::LinguisticAnalysisStructure; +using namespace Lima::LinguisticProcessing::AnalysisDict; +using namespace Lima::LinguisticProcessing::FlatTokenizer; +using namespace Lima::Common::XMLConfigurationFiles; + +namespace Lima +{ +namespace LinguisticProcessing +{ +namespace MorphologicAnalysis +{ + + SimpleFactory hunspellSpellingAlternativesFactory(HUNSPELL_SPELLING_ALTERNATIVES_CLASSID); + + +class HunspellSpellingAlternativesPrivate +{ + friend class HunspellSpellingAlternatives; + +public: + HunspellSpellingAlternativesPrivate() : m_dictionary(0), m_language(), m_hunspell(0), m_bestOnly(false) {} + virtual ~HunspellSpellingAlternativesPrivate() {delete m_hunspell;} + + + void setHunspellSpellingAlternatives( + LinguisticAnalysisStructure::Token* token, + LinguisticAnalysisStructure::MorphoSyntacticData* tokenData, + FsaStringsPool& sp); + + AnalysisDict::AbstractAnalysisDictionary* m_dictionary; + MediaId m_language; + Hunspell* m_hunspell; + bool m_bestOnly; +}; + + +HunspellSpellingAlternatives::HunspellSpellingAlternatives() : m_d(new HunspellSpellingAlternativesPrivate()) +{ + +} + +HunspellSpellingAlternatives::~HunspellSpellingAlternatives() +{ + delete m_d; +} + +void HunspellSpellingAlternatives::init( + 
Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, + Manager* manager) +{ + MORPHOLOGINIT; + LDEBUG << "HunspellSpellingAlternatives::init"; std::string spellDico; + m_d->m_language = manager->getInitializationParameters().media; + + try + { + // try to get a specific spellchecking dictionary name from the config file + spellDico = unitConfiguration.getParamsValueAtKey("spellcheckDictionary"); + } + catch (NoSuchParam& ) + { + LERROR << "no param 'spellcheckDictionary' in HunspellSpellingAlternatives group for language " << (int) m_d->m_language; + throw InvalidConfiguration(); + } + LDEBUG << "HunspellSpellingAlternatives::init requesting Hunspell spellcheck dictionary" << Common::MediaticData::MediaticData::changeable().getResourcesPath()+"/Spellchecking/" << spellDico; + if (m_d->m_hunspell != 0) delete m_d->m_hunspell; + m_d->m_hunspell = new Hunspell( (Common::MediaticData::MediaticData::changeable().getResourcesPath()+"/Spellchecking/"+spellDico+".aff").c_str(), + (Common::MediaticData::MediaticData::changeable().getResourcesPath()+"/Spellchecking/"+spellDico+".dic").c_str() ); + try + { + std::string dico=unitConfiguration.getParamsValueAtKey("dictionary"); + AbstractResource* res= LinguisticResources::single().getResource(m_d->m_language,dico); + m_d->m_dictionary=static_cast(res); + } + catch (NoSuchParam& ) + { + LERROR << "no param 'dictionary' in HunspellSpellingAlternatives group for language " << (int) m_d->m_language; + throw InvalidConfiguration(); + } + try + { + // try to get a specific spellchecking dictionary name from the config file + m_d->m_bestOnly = unitConfiguration.getBooleanParameter("bestOnly"); + } + catch (NoSuchParam& ) + { + LNOTICE << "no param 'bestOnly' in HunspellSpellingAlternatives group for language " << (int) m_d->m_language << ". 
Use default"; + } +} + + +LimaStatusCode HunspellSpellingAlternatives::process(AnalysisContent& analysis) const +{ + TimeUtils::updateCurrentTime(); + MORPHOLOGINIT; + LINFO << "MorphologicalAnalysis: starting process HunspellSpellingAlternatives"; + + FsaStringsPool& sp=Common::MediaticData::MediaticData::changeable().stringsPool(m_d->m_language); + AnalysisGraph* tokenList=static_cast(analysis.getData("AnalysisGraph")); + LinguisticGraph* g=tokenList->getGraph(); + VertexDataPropertyMap dataMap=get(vertex_data,*g); + VertexTokenPropertyMap tokenMap=get(vertex_token,*g); + + LinguisticGraphVertex firstVx = tokenList->firstVertex(); + LinguisticGraphVertex lastVx = tokenList->lastVertex(); + + std::set< std::string > alreadyStored; + std::set visited; + //std::set alreadyStoredVertices; 32/64-bit compatibility + std::set alreadyStoredVertices; + + std::queue toVisit; + toVisit.push(firstVx); + + while (!toVisit.empty()) + { + LinguisticGraphVertex v=toVisit.front(); +#ifdef DEBUG_LP + LDEBUG << "HunspellSpellingAlternatives::process visiting" << v; +#endif + + toVisit.pop(); + if (v == lastVx) { + continue; + } + + LinguisticGraphOutEdgeIt outItr,outItrEnd; + for (boost::tie(outItr,outItrEnd)=out_edges(v,*g); + outItr!=outItrEnd; + outItr++) + { + LinguisticGraphVertex next=target(*outItr,*g); + if (visited.find(next)==visited.end()) + { + visited.insert(next); + toVisit.push(next); + } + } + + if (v != firstVx && v != lastVx) + { + LDEBUG << "HunspellSpellingAlternatives::process processing vertex " << v; + Token* currentToken=tokenMap[v]; + MorphoSyntacticData* msd=dataMap[v]; + + if (currentToken!=0 && msd!=0) + { + if (msd->empty()) + { + m_d->setHunspellSpellingAlternatives( + currentToken, + msd, + sp); + } + } + } + } + + + LINFO << "MorphologicalAnalysis: ending process HunspellSpellingAlternatives"; + return SUCCESS_ID; +} + +void HunspellSpellingAlternativesPrivate::setHunspellSpellingAlternatives( + Token* token, + MorphoSyntacticData* tokenData, + FsaStringsPool& sp) 
+{ + // no spelling alternatives are searched for capitalized, abbreviated, hyphenated, possessive, all-uppercase or URL tokens + MORPHOLOGINIT; + // FIXME Conditions below could be process unit parameters + const LimaString& tokenStr=token->stringForm(); + if (token->status().getAlphaCapital() == T_CAPITAL + || token->status().getAlphaCapital() == T_CAPITAL_1ST + || token->status().getAlphaCapital() == T_CAPITAL_SMALL + || token->status().isAlphaConcatAbbrev() + || token->status().isAlphaHyphen() + || token->status().isAlphaPossessive() + || tokenStr.toUpper() == tokenStr + || token->status().defaultKey() == "t_url") + { + return; + } + char **suggestions = 0; + // suggestCount is the number of strings returned by hunspell; it must be passed + // unchanged to free_list() below, otherwise every suggestion after the first + // would leak when bestOnly limits how many suggestions are examined + const int suggestCount = m_hunspell->suggest(&suggestions, tokenStr.toUtf8().constData()); + int suggestResult = (m_bestOnly && suggestCount > 1) ? 1 : suggestCount; + for (int i = 0; i < suggestResult; i++) + { + LimaString correction = LimaString::fromUtf8(suggestions[i]); + // FIXME Conditions below could be process unit parameters + if ( correction.size() > 1 && correction != tokenStr ) + { + LDEBUG << "HunspellSpellingAlternativesPrivate::setHunspellSpellingAlternatives trying to correct" << tokenStr << "into" << correction; + DictionaryEntry entry (m_dictionary->getEntry(correction)); + MorphoSyntacticDataHandler lingInfosHandler(*tokenData, SPELLING_ALTERNATIVE); + + +// if (!entry.isEmpty()) + { + LINFO << "HunspellSpellingAlternativesPrivate::setHunspellSpellingAlternatives correcting" << tokenStr << "into" << correction << "at" << token->position(); + // add orthographic alternative to Token; + StringsPoolIndex idx=sp[correction]; + token->addOrthographicAlternatives(idx); + + if (entry.hasLingInfos()) + { + entry.parseLingInfos(&lingInfosHandler); + } + } +// else +// { +// LDEBUG << "HunspellSpellingAlternativesPrivate::setHunspellSpellingAlternatives correction" << correction << "not found in the dictionary"; +// delete entry; +// } + } + } + m_hunspell->free_list(&suggestions, suggestCount); +} + +} // MorphologicAnalysis +} // LinguisticProcessing +} // Lima + + + diff --git
a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/HunspellSpellingAlternatives.h b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/HunspellSpellingAlternatives.h new file mode 100644 index 000000000..b5f085634 --- /dev/null +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/HunspellSpellingAlternatives.h @@ -0,0 +1,72 @@ +/* + Copyright 2015 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. 
If not, see +*/ + +#ifndef HUNSPELL_SPELLING_ALTERNATIVES_H +#define HUNSPELL_SPELLING_ALTERNATIVES_H + +#include "common/Data/LimaString.h" +#include "common/MediaProcessors/MediaProcessUnit.h" +#include "linguisticProcessing/core/LinguisticAnalysisStructure/Token.h" +#include "linguisticProcessing/core/LinguisticAnalysisStructure/MorphoSyntacticData.h" + +namespace Lima +{ +namespace LinguisticProcessing +{ +namespace FlatTokenizer +{ + class CharChart; +} +namespace LinguisticAnalysisStructure +{ + class MorphoSyntacticData; +} +namespace AnalysisDict +{ + class AbstractAnalysisDictionary; +} +namespace MorphologicAnalysis +{ + +#define HUNSPELL_SPELLING_ALTERNATIVES_CLASSID "HunspellSpellingAlternatives" +class HunspellSpellingAlternativesPrivate; +class HunspellSpellingAlternatives : public MediaProcessUnit { + +public: + HunspellSpellingAlternatives(); + virtual ~HunspellSpellingAlternatives(); + + void init( + Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, + Manager* manager); + + LimaStatusCode process( + AnalysisContent& analysis) const; + +private: + HunspellSpellingAlternativesPrivate* m_d; + +}; + +} // MorphologicAnalysis +} // LinguisticProcessing +} // Lima + + +#endif // HUNSPELL_SPELLING_ALTERNATIVES_H diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/HyphenWordAlternatives.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/HyphenWordAlternatives.cpp index b61398322..0cf475de5 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/HyphenWordAlternatives.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/HyphenWordAlternatives.cpp @@ -1,342 +1,354 @@ -/* - Copyright 2002-2013 CEA LIST - - This file is part of LIMA. 
- - LIMA is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - LIMA is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with LIMA. If not, see -*/ -/** - * @brief HyphenWordAlternatives is the module which creates split alternatives - * for hyphen word tokens. Each token from the supplied tokens path is processed : - * o FullToken must be "AlphaHyphen" typed by Tokenizer. - * o If a token has a single word entry or an orthographic alternative - * it is not decomposed - * o Token is break at hyphen boundaries and a new alternative path is created - * o each FullToken of the new Path is searched into dictionnary as Simple Word - * o If special hyphen entry, no alternatives are searched, - * otherwise Accented alternatives are searched - * o Path is valid even if not all FullToken have entry into dictionary - * @b - * Modified @date Dec, 02 2002 by GC to handle splitting on t_alpha_possessive - * - * @file HyphenWordAlternatives.cpp - * @author NAUTITIA jys - * @author Gael de Chalendar - * @author Copyright (c) 2002-2003 by CEA - * - * @date created on Nov, 30 2002 - * @version $Id$ - * - */ - -#include "HyphenWordAlternatives.h" -#include "MorphoSyntacticDataHandler.h" - -#include "common/Data/LimaString.h" -#include "common/Data/strwstrtools.h" -#include "common/MediaticData/mediaticData.h" -#include "linguisticProcessing/common/annotationGraph/AnnotationData.h" -#include "linguisticProcessing/client/LinguisticProcessingException.h" -#include "common/time/timeUtilsController.h" -#include 
"common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" -#include "linguisticProcessing/core/LinguisticAnalysisStructure/MorphoSyntacticData.h" -#include "linguisticProcessing/core/LinguisticAnalysisStructure/Token.h" -#include "common/MediaProcessors/MediaProcessors.h" -#include "linguisticProcessing/core/LinguisticProcessors/LimaStringText.h" -#include "linguisticProcessing/core/LinguisticResources/LinguisticResources.h" -#include "common/AbstractFactoryPattern/SimpleFactory.h" - -using namespace std; -using namespace Lima::Common::MediaticData; -using namespace Lima::Common::AnnotationGraphs; -using namespace Lima::LinguisticProcessing::AnalysisDict; -using namespace Lima::LinguisticProcessing::LinguisticAnalysisStructure; - -namespace Lima -{ -namespace LinguisticProcessing -{ -namespace MorphologicAnalysis -{ - -SimpleFactory hyphenwordAlternativesFactory(HYPHENWORDALTERNATIVESFACTORY_CLASSID); - -HyphenWordAlternatives::HyphenWordAlternatives() -{} - -HyphenWordAlternatives::~HyphenWordAlternatives() -{} - -void HyphenWordAlternatives::init( - Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, - Manager* manager) - -{ - MORPHOLOGINIT; - m_language = manager->getInitializationParameters().media; - try - { - string dico=unitConfiguration.getParamsValueAtKey("dictionary"); - AbstractResource* res=LinguisticResources::single().getResource(m_language,dico); - m_dictionary=static_cast(res); - } - catch (Common::XMLConfigurationFiles::NoSuchParam& ) - { - LERROR << "no param 'dictionary' in HyphenWordAlternatives group for language " << (int) m_language; - throw InvalidConfiguration(); - } - try - { - string charchart=unitConfiguration.getParamsValueAtKey("charChart"); - AbstractResource* res=LinguisticResources::single().getResource(m_language,charchart); - m_charChart=static_cast(res); - } - catch (Common::XMLConfigurationFiles::NoSuchParam& ) - { - LERROR << "no param 'charChart' in HyphenWordAlternatives group for language " 
<< (int) m_language; - throw InvalidConfiguration(); - } - try - { - string tok=unitConfiguration.getParamsValueAtKey("tokenizer"); - const MediaProcessUnit* res=manager->getObject(tok); - m_tokenizer=static_cast(res); - } - catch (Common::XMLConfigurationFiles::NoSuchParam& ) - { - LERROR << "no param 'dictionary' in HyphenWordAlternatives group for language " << (int) m_language; - throw InvalidConfiguration(); - } - try - { - m_deleteHyphenWord=( unitConfiguration.getParamsValueAtKey("deleteHyphenWord") == "true"); - } - catch (Common::XMLConfigurationFiles::NoSuchParam& ) - { - LWARN << "no param 'deleteHyphenWord' in HyphenAlternatives group for language " << (int) m_language; - LWARN << "use default value : true"; - m_deleteHyphenWord=true; - } - try - { - string confident=unitConfiguration.getParamsValueAtKey("confidentMode"); - m_confidentMode=(confident=="true"); - } - catch (Common::XMLConfigurationFiles::NoSuchParam& ) - { - LWARN << "no param 'confidentMode' in HyphenWordAlternatives group for language " << (int) m_language; - LWARN << "use default value : 'true'"; - m_confidentMode=true; - } - FsaStringsPool* sp=&Common::MediaticData::MediaticData::changeable().stringsPool(m_language); - m_reader=new AlternativesReader(m_confidentMode,true,true,true,m_charChart,sp); -} - -LimaStatusCode HyphenWordAlternatives::process( - AnalysisContent& analysis) const -{ - Lima::TimeUtilsController timer("HyphenWordAlternatives"); - MORPHOLOGINIT; - LINFO << "MorphologicalAnalysis: starting process HyphenWordAlternatives"; - - AnnotationData* annotationData = static_cast< AnnotationData* >(analysis.getData("AnnotationData")); - if (annotationData==0) - { - LDEBUG << "HyphenWordAlternatives::process: Misssing AnnotationData. 
Create it"; - annotationData = new AnnotationData(); - if (static_cast(analysis.getData("AnalysisGraph")) != 0) - { - static_cast(analysis.getData("AnalysisGraph"))->populateAnnotationGraph(annotationData, "AnalysisGraph"); - } - analysis.setData("AnnotationData",annotationData); - } - - AnalysisGraph* tokenList=static_cast(analysis.getData("AnalysisGraph")); - LinguisticGraph* graph=tokenList->getGraph(); - - VertexDataPropertyMap dataMap = get( vertex_data, *graph ); - VertexTokenPropertyMap tokenMap = get( vertex_token, *graph ); - - try - { - LinguisticGraphVertexIt it, it_end; - boost::tie(it, it_end) = vertices(*graph); - for (; it != it_end; it++) - { - MorphoSyntacticData* currentToken = dataMap[*it]; - Token* tok= tokenMap[*it]; - if (currentToken==0) continue; - // - if (currentToken->size() == 0) - { - if (tok->status().isAlphaHyphen()) - { - makeHyphenSplitAlternativeFor(*it, graph, annotationData); - } - } - } - } - catch (std::exception &exc) - { - MORPHOLOGINIT; - LWARN << "Exception in HyphenWordAlternatives : " << exc.what(); - return UNKNOWN_ERROR; - } - - LINFO << "MorphologicalAnalysis: ending process HyphenWordAlternatives"; - return SUCCESS_ID; -} - -void HyphenWordAlternatives::makeHyphenSplitAlternativeFor( - LinguisticGraphVertex splitted, - LinguisticGraph* graph, - AnnotationData* annotationData) const -{ - VertexTokenPropertyMap tokenMap = get( vertex_token, *graph ); - VertexDataPropertyMap dataMap = get( vertex_data, *graph ); - Token* currentToken = tokenMap[splitted]; - - // first, get a copy of token string - LimaString hyphenWord(currentToken->stringForm()); - // first replace hyphens by spaces - int pos = hyphenWord.indexOf(LimaChar(L'-'), 0); - while (pos != -1) - { - hyphenWord[(int)pos] = LimaChar(L' '); - pos = hyphenWord.indexOf(LimaChar(L'-'), pos+1); - } - // then submit string to Tokenizer - AnalysisContent toTokenize; - toTokenize.setData("Text",new LimaStringText(hyphenWord)); - LimaStatusCode 
status=m_tokenizer->process(toTokenize); - if (status != SUCCESS_ID) return; - AnalysisGraph* agTokenizer=static_cast(toTokenize.getData("AnalysisGraph")); - LinguisticGraph* tokgraph=agTokenizer->getGraph(); - - // setup position field - // insert each new FullToken into alternative path - uint64_t beginPos = currentToken->position()-1; - LinguisticGraphVertex previous = splitted; - LinguisticGraphVertex currentVx=agTokenizer->firstVertex(); - // go one step forward on the new path - { - LinguisticGraphAdjacencyIt adjItr,adjItrEnd; - boost::tie(adjItr,adjItrEnd) = adjacent_vertices(currentVx,*tokgraph); - if (adjItr==adjItrEnd) - { - MORPHOLOGINIT; - LERROR << "HypenWordAlternatives : no token forward !"; - throw LinguisticProcessingException(); - } - currentVx=*adjItr; - } - // LinguisticGraphVertex lastVx=agTokenizer->lastVertex(); - VertexTokenPropertyMap tokTokenMap=get(vertex_token,*tokgraph); - Token* tokenizerToken=tokTokenMap[currentVx]; - - bool isFirst=true; - - while (tokenizerToken) - { - // prepare the new vertex - Token* newFT=new Token(*tokenizerToken); - newFT->status().setAlphaHyphen( true ); - MorphoSyntacticData* newData=new MorphoSyntacticData(); - LinguisticGraphVertex newVertex = add_vertex(*graph); - - AnnotationGraphVertex agv = annotationData->createAnnotationVertex(); - annotationData->addMatching("AnalysisGraph", newVertex, "annot", agv); - annotationData->annotate(agv, Common::Misc::utf8stdstring2limastring("AnalysisGraph"), newVertex); - - - tokenMap[newVertex]=newFT; - dataMap[newVertex]=newData; - newFT-> setPosition(newFT->position() + beginPos); - const LimaString& newTokenStr=newFT->stringForm(); - MorphoSyntacticDataHandler handler(*newData,HYPHEN_ALTERNATIVE); - - if (isFirst) - { - LimaString newTokHyphen(newTokenStr); - newTokHyphen.append(LimaChar('-')); - DictionaryEntry dicoEntry(m_dictionary->getEntry(newTokHyphen)); - if (!dicoEntry.isEmpty() && dicoEntry.hasLingInfos()) - { - dicoEntry.parseLingInfos(&handler); - } else 
{ - m_reader->readAlternatives( - *newFT, - *m_dictionary, - &handler, - 0, - &handler); - } - } - else - { - m_reader->readAlternatives( - *newFT, - *m_dictionary, - &handler, - 0, - &handler); - } - - // links the new vertex to its predecessor in the graph - if (previous == splitted) - { - LinguisticGraphInEdgeIt ite, ite_end; - boost::tie(ite, ite_end) = in_edges(splitted, *graph); - for (; ite != ite_end; ite++) - { - add_edge(source(*ite,*graph), newVertex, *graph); - } - } - else - { - add_edge(previous, newVertex, *graph); - } - previous = newVertex; - // go one step forward on the new path - LinguisticGraphAdjacencyIt adjItr,adjItrEnd; - boost::tie(adjItr,adjItrEnd) = adjacent_vertices(currentVx,*tokgraph); - if (adjItr==adjItrEnd) - { - MORPHOLOGINIT; - LERROR << "HypenWordAlternatives : no token forward !"; - throw LinguisticProcessingException(); - } - currentVx=*adjItr; - tokenizerToken=tokTokenMap[currentVx]; - } - - // links the last new vertex created to the successors of the splitted vertex - LinguisticGraphOutEdgeIt ite, ite_end; - boost::tie(ite, ite_end) = out_edges(splitted, *graph); - for (; ite != ite_end; ite++) - { - add_edge(previous, target(*ite,*graph), *graph); - } - - // if have to delete hyphen word, then clear it in the graph - if (m_deleteHyphenWord) - { - clear_vertex(splitted,*graph); - } -} - -} // closing namespace MorphologicAnalysis -} // closing namespace LinguisticProcessing -} // closing namespace Lima +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. If not, see +*/ +/** + * @brief HyphenWordAlternatives is the module which creates split alternatives + * for hyphen word tokens. Each token from the supplied tokens path is processed : + * o FullToken must be "AlphaHyphen" typed by Tokenizer. + * o If a token has a single word entry or an orthographic alternative + * it is not decomposed + * o Token is break at hyphen boundaries and a new alternative path is created + * o each FullToken of the new Path is searched into dictionnary as Simple Word + * o If special hyphen entry, no alternatives are searched, + * otherwise Accented alternatives are searched + * o Path is valid even if not all FullToken have entry into dictionary + * @b + * Modified @date Dec, 02 2002 by GC to handle splitting on t_alpha_possessive + * + * @file HyphenWordAlternatives.cpp + * @author NAUTITIA jys + * @author Gael de Chalendar + * @author Copyright (c) 2002-2003 by CEA + * + * @date created on Nov, 30 2002 + * @version $Id$ + * + */ + +#include "HyphenWordAlternatives.h" +#include "MorphoSyntacticDataHandler.h" + +#include "common/Data/LimaString.h" +#include "common/Data/strwstrtools.h" +#include "common/MediaticData/mediaticData.h" +#include "linguisticProcessing/common/annotationGraph/AnnotationData.h" +#include "linguisticProcessing/client/LinguisticProcessingException.h" +#include "common/time/timeUtilsController.h" +#include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" +#include "linguisticProcessing/core/LinguisticAnalysisStructure/MorphoSyntacticData.h" +#include "linguisticProcessing/core/LinguisticAnalysisStructure/Token.h" +#include "common/MediaProcessors/MediaProcessors.h" +#include "linguisticProcessing/core/LinguisticProcessors/LimaStringText.h" +#include "linguisticProcessing/core/LinguisticResources/LinguisticResources.h" +#include 
"common/AbstractFactoryPattern/SimpleFactory.h"
+
+using namespace std;
+using namespace Lima::Common::MediaticData;
+using namespace Lima::Common::AnnotationGraphs;
+using namespace Lima::LinguisticProcessing::AnalysisDict;
+using namespace Lima::LinguisticProcessing::LinguisticAnalysisStructure;
+
+namespace Lima
+{
+namespace LinguisticProcessing
+{
+namespace MorphologicAnalysis
+{
+
+SimpleFactory hyphenwordAlternativesFactory(HYPHENWORDALTERNATIVESFACTORY_CLASSID);
+
+HyphenWordAlternatives::HyphenWordAlternatives() : m_reader(0) // null until init() allocates it, so the destructor's delete is always well-defined
+{}
+
+HyphenWordAlternatives::~HyphenWordAlternatives()
+{
+  delete m_reader;
+}
+
+void HyphenWordAlternatives::init(
+  Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration,
+  Manager* manager)
+// Reads the required 'dictionary', 'charChart' and 'tokenizer' parameters (throws InvalidConfiguration when one is absent) and the optional 'deleteHyphenWord' and 'confidentMode' flags (both default to true), then allocates m_reader.
+{
+  MORPHOLOGINIT;
+  m_language = manager->getInitializationParameters().media;
+  try
+  {
+    string dico=unitConfiguration.getParamsValueAtKey("dictionary");
+    AbstractResource* res=LinguisticResources::single().getResource(m_language,dico);
+    m_dictionary=static_cast(res);
+  }
+  catch (Common::XMLConfigurationFiles::NoSuchParam& )
+  {
+    LERROR << "no param 'dictionary' in HyphenWordAlternatives group for language " << (int) m_language;
+    throw InvalidConfiguration();
+  }
+  try
+  {
+    string charchart=unitConfiguration.getParamsValueAtKey("charChart");
+    AbstractResource* res=LinguisticResources::single().getResource(m_language,charchart);
+    m_charChart=static_cast(res);
+  }
+  catch (Common::XMLConfigurationFiles::NoSuchParam& )
+  {
+    LERROR << "no param 'charChart' in HyphenWordAlternatives group for language " << (int) m_language;
+    throw InvalidConfiguration();
+  }
+  try
+  {
+    string tok=unitConfiguration.getParamsValueAtKey("tokenizer");
+    const MediaProcessUnit* res=manager->getObject(tok);
+    m_tokenizer=static_cast(res);
+  }
+  catch (Common::XMLConfigurationFiles::NoSuchParam& )
+  {
+    LERROR << "no param 'tokenizer' in HyphenWordAlternatives group for language " << (int) m_language; // name the key that is actually missing
+    throw InvalidConfiguration();
+  }
+  try
+  {
+    m_deleteHyphenWord=( unitConfiguration.getParamsValueAtKey("deleteHyphenWord") == "true");
+  }
+  catch (Common::XMLConfigurationFiles::NoSuchParam& )
+  {
+    LWARN << "no param 'deleteHyphenWord' in HyphenAlternatives group for language " << (int) m_language;
+    LWARN << "use default value : true";
+    m_deleteHyphenWord=true;
+  }
+  try
+  {
+    string confident=unitConfiguration.getParamsValueAtKey("confidentMode");
+    m_confidentMode=(confident=="true");
+  }
+  catch (Common::XMLConfigurationFiles::NoSuchParam& )
+  {
+    LWARN << "no param 'confidentMode' in HyphenWordAlternatives group for language " << (int) m_language;
+    LWARN << "use default value : 'true'";
+    m_confidentMode=true;
+  }
+  FsaStringsPool* sp=&Common::MediaticData::MediaticData::changeable().stringsPool(m_language);
+  m_reader=new AlternativesReader(m_confidentMode,true,true,true,m_charChart,sp);
+}
+
+LimaStatusCode HyphenWordAlternatives::process(
+  AnalysisContent& analysis) const
+{
+  Lima::TimeUtilsController timer("HyphenWordAlternatives");
+  MORPHOLOGINIT;
+  LINFO << "MorphologicalAnalysis: starting process HyphenWordAlternatives";
+
+#ifdef ANTINNO_SPECIFIC
+  auto const& stopAnalyze = analysis.stopAnalyze();
+#endif
+  AnnotationData* annotationData = static_cast< AnnotationData* >(analysis.getData("AnnotationData"));
+  if (annotationData==0)
+  {
+    LDEBUG << "HyphenWordAlternatives::process: Misssing AnnotationData.
Create it"; + annotationData = new AnnotationData(); + if (static_cast(analysis.getData("AnalysisGraph")) != 0) + { + static_cast(analysis.getData("AnalysisGraph"))->populateAnnotationGraph(annotationData, "AnalysisGraph"); + } + analysis.setData("AnnotationData",annotationData); + } + + AnalysisGraph* tokenList=static_cast(analysis.getData("AnalysisGraph")); + LinguisticGraph* graph=tokenList->getGraph(); + + VertexDataPropertyMap dataMap = get( vertex_data, *graph ); + VertexTokenPropertyMap tokenMap = get( vertex_token, *graph ); + + try + { + LinguisticGraphVertexIt it, it_end; + boost::tie(it, it_end) = vertices(*graph); + for (; it != it_end; it++) + { +#ifdef ANTINNO_SPECIFIC + if (stopAnalyze) + { + LERROR << "Analyze too long. Stopped in HyphenWordAlternatives"; + return TIME_OVERFLOW; + } +#endif + MorphoSyntacticData* currentToken = dataMap[*it]; + Token* tok= tokenMap[*it]; + if (currentToken==0) continue; + // + if (currentToken->size() == 0) + { + if (tok->status().isAlphaHyphen()) + { + makeHyphenSplitAlternativeFor(*it, graph, annotationData); + } + } + } + } + catch (std::exception &exc) + { + MORPHOLOGINIT; + LWARN << "Exception in HyphenWordAlternatives : " << exc.what(); + return UNKNOWN_ERROR; + } + + LINFO << "MorphologicalAnalysis: ending process HyphenWordAlternatives"; + return SUCCESS_ID; +} + +void HyphenWordAlternatives::makeHyphenSplitAlternativeFor( + LinguisticGraphVertex splitted, + LinguisticGraph* graph, + AnnotationData* annotationData) const +{ + VertexTokenPropertyMap tokenMap = get( vertex_token, *graph ); + VertexDataPropertyMap dataMap = get( vertex_data, *graph ); + Token* currentToken = tokenMap[splitted]; + + // first, get a copy of token string + LimaString hyphenWord(currentToken->stringForm()); + // first replace hyphens by spaces + int pos = hyphenWord.indexOf(LimaChar(L'-'), 0); + while (pos != -1) + { + hyphenWord[(int)pos] = LimaChar(L' '); + pos = hyphenWord.indexOf(LimaChar(L'-'), pos+1); + } + // then submit 
string to Tokenizer + AnalysisContent toTokenize; + toTokenize.setData("Text",new LimaStringText(hyphenWord)); + LimaStatusCode status=m_tokenizer->process(toTokenize); + if (status != SUCCESS_ID) return; + AnalysisGraph* agTokenizer=static_cast(toTokenize.getData("AnalysisGraph")); + LinguisticGraph* tokgraph=agTokenizer->getGraph(); + + // setup position field + // insert each new FullToken into alternative path + uint64_t beginPos = currentToken->position()-1; + LinguisticGraphVertex previous = splitted; + LinguisticGraphVertex currentVx=agTokenizer->firstVertex(); + // go one step forward on the new path + { + LinguisticGraphAdjacencyIt adjItr,adjItrEnd; + boost::tie(adjItr,adjItrEnd) = adjacent_vertices(currentVx,*tokgraph); + if (adjItr==adjItrEnd) + { + MORPHOLOGINIT; + LERROR << "HypenWordAlternatives : no token forward !"; + throw LinguisticProcessingException(); + } + currentVx=*adjItr; + } + // LinguisticGraphVertex lastVx=agTokenizer->lastVertex(); + VertexTokenPropertyMap tokTokenMap=get(vertex_token,*tokgraph); + Token* tokenizerToken=tokTokenMap[currentVx]; + + bool isFirst=true; + + while (tokenizerToken) + { + // prepare the new vertex + Token* newFT=new Token(*tokenizerToken); + newFT->status().setAlphaHyphen( true ); + MorphoSyntacticData* newData=new MorphoSyntacticData(); + LinguisticGraphVertex newVertex = add_vertex(*graph); + + AnnotationGraphVertex agv = annotationData->createAnnotationVertex(); + annotationData->addMatching("AnalysisGraph", newVertex, "annot", agv); + annotationData->annotate(agv, Common::Misc::utf8stdstring2limastring("AnalysisGraph"), newVertex); + + + tokenMap[newVertex]=newFT; + dataMap[newVertex]=newData; + newFT-> setPosition(newFT->position() + beginPos); + const LimaString& newTokenStr=newFT->stringForm(); + MorphoSyntacticDataHandler handler(*newData,HYPHEN_ALTERNATIVE); + + if (isFirst) + { + LimaString newTokHyphen(newTokenStr); + newTokHyphen.append(LimaChar('-')); + DictionaryEntry 
dicoEntry(m_dictionary->getEntry(newTokHyphen)); + if (!dicoEntry.isEmpty() && dicoEntry.hasLingInfos()) + { + dicoEntry.parseLingInfos(&handler); + } else { + m_reader->readAlternatives( + *newFT, + *m_dictionary, + &handler, + 0, + &handler); + } + } + else + { + m_reader->readAlternatives( + *newFT, + *m_dictionary, + &handler, + 0, + &handler); + } + + // links the new vertex to its predecessor in the graph + if (previous == splitted) + { + LinguisticGraphInEdgeIt ite, ite_end; + boost::tie(ite, ite_end) = in_edges(splitted, *graph); + for (; ite != ite_end; ite++) + { + add_edge(source(*ite,*graph), newVertex, *graph); + } + } + else + { + add_edge(previous, newVertex, *graph); + } + previous = newVertex; + // go one step forward on the new path + LinguisticGraphAdjacencyIt adjItr,adjItrEnd; + boost::tie(adjItr,adjItrEnd) = adjacent_vertices(currentVx,*tokgraph); + if (adjItr==adjItrEnd) + { + MORPHOLOGINIT; + LERROR << "HypenWordAlternatives : no token forward !"; + throw LinguisticProcessingException(); + } + currentVx=*adjItr; + tokenizerToken=tokTokenMap[currentVx]; + } + + // links the last new vertex created to the successors of the splitted vertex + LinguisticGraphOutEdgeIt ite, ite_end; + boost::tie(ite, ite_end) = out_edges(splitted, *graph); + for (; ite != ite_end; ite++) + { + add_edge(previous, target(*ite,*graph), *graph); + } + + // if have to delete hyphen word, then clear it in the graph + if (m_deleteHyphenWord) + { + clear_vertex(splitted,*graph); + } +} + +} // closing namespace MorphologicAnalysis +} // closing namespace LinguisticProcessing +} // closing namespace Lima diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/IdiomaticAlternativesConstraints.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/IdiomaticAlternativesConstraints.cpp index 09238c128..6d6a834cf 100644 --- 
a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/IdiomaticAlternativesConstraints.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/IdiomaticAlternativesConstraints.cpp @@ -180,13 +180,22 @@ bool CreateIdiomaticAlternative::operator()(Automaton::RecognizerMatch& result, { // ignore current idiomatic expression, continue MORPHOLOGINIT; - LWARN << "idiomatic expression ignored: " << Common::Misc::limastring2utf8stdstring(result.concatString()) + LWARN << "idiomatic expression ignored: " << Common::Misc::limastring2utf8stdstring(result.concatString()) << ": overlapping with a previous one"; return false; } // create the new token std::pair newToken = createAlternativeToken(result); + if (newToken.second->empty()) + { + // ignore current idiomatic expression, continue + MORPHOLOGINIT; + LERROR << "CreateIdiomaticAlternative::operator() Got empty morphosyntactic data. Abort"; + delete newToken.first; + delete newToken.second; + return false; + } // add the vertex LinguisticGraphVertex idiomaticVertex = @@ -241,13 +250,22 @@ bool CreateIdiomaticAlternative::operator()(Automaton::RecognizerMatch& result, { // ignore current idiomatic expression, continue MORPHOLOGINIT; - LWARN << "idiomatic expression ignored: " << Common::Misc::limastring2utf8stdstring(result.concatString()) + LWARN << "idiomatic expression ignored: " << Common::Misc::limastring2utf8stdstring(result.concatString()) << ": overlapping with a previous one"; return false; } // create the new token pair newToken = createAlternativeToken(result); + if (newToken.second->empty()) + { + // ignore current idiomatic expression, continue + MORPHOLOGINIT; + LERROR << "CreateIdiomaticAlternative::operator() Got empty morphosyntactic data. 
Abort"; + delete newToken.first; + delete newToken.second; + return false; + } // add the vertex LinguisticGraphVertex idiomaticVertex = diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/OrthographicAlternatives.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/OrthographicAlternatives.cpp index fbeeb44c0..d58dbe53b 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/OrthographicAlternatives.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/OrthographicAlternatives.cpp @@ -127,13 +127,13 @@ LimaStatusCode OrthographicAlternatives::process( for (;it!=itEnd;it++) { LDEBUG << "processing vertex " << *it; - MorphoSyntacticData* currentToken=dataMap[*it]; + MorphoSyntacticData* currentTokenData=dataMap[*it]; Token* tok=tokenMap[*it]; - if (currentToken!=0) + if (currentTokenData!=0) { // if in confidentMode and token has already ling infos, skip - if ( m_confidentMode && (currentToken->size()>0) ) continue; + if ( m_confidentMode && (currentTokenData->size()>0) ) continue; // set orthographic alternatives given by dictionary // using the alternatives directly given by the morphosyntactic data @@ -145,20 +145,20 @@ LimaStatusCode OrthographicAlternatives::process( LimaString oa = entry->nextAccented(); while ( oa.size() > 0 ) { - createAlternative(tok,currentToken,oa,m_dictionary,sp); + createAlternative(tok,currentTokenData,oa,m_dictionary,sp); oa = entry->nextAccented(); } } } // if in confidentMode and token has already ling infos, skip - if (m_confidentMode && (currentToken->size() > 0) ) continue; + if (m_confidentMode && (currentTokenData->size() > 0) ) continue; // if no ling infos, then lower and unmark string LDEBUG << "set unmark alternatives"; setOrthographicAlternatives( tok, - currentToken, + currentTokenData, m_dictionary, m_charChart, sp); diff --git 
a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/SimpleWord.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/SimpleWord.cpp index f428c8dc0..9ab1424e0 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/SimpleWord.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/MorphologicAnalysis/SimpleWord.cpp @@ -1,271 +1,278 @@ -/* - Copyright 2002-2013 CEA LIST - - This file is part of LIMA. - - LIMA is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - LIMA is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with LIMA. If not, see -*/ - -// NAUTITIA -// -// jys 8-OCT-2002 -// -// SimpleWord is the implementation of the 1st module of -// Morphological Analysis. Each token from the main tokens -// path is searched into the specified dictionary. 
- - -#include "SimpleWord.h" - -#include "linguisticProcessing/LinguisticProcessingCommon.h" -#include "common/MediaticData/mediaticData.h" -#include "common/time/timeUtilsController.h" -#include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" -#include "linguisticProcessing/common/annotationGraph/AnnotationData.h" -#include "linguisticProcessing/core/LinguisticAnalysisStructure/Token.h" -#include "linguisticProcessing/core/LinguisticAnalysisStructure/AnalysisGraph.h" -#include "linguisticProcessing/core/LinguisticResources/LinguisticResources.h" -#include "linguisticProcessing/core/AnalysisDict/AbstractDictionaryEntry.h" -#include "common/AbstractFactoryPattern/SimpleFactory.h" -#include "common/misc/fsaStringsPool.h" -#include "MorphoSyntacticDataHandler.h" -#include "ConcatenatedDataHandler.h" -#include "AccentedConcatenatedDataHandler.h" -#include "SequenceEntryHandler.h" - -#include - -using namespace std; -using namespace Lima::Common::AnnotationGraphs; -using namespace Lima::Common::MediaticData; -using namespace Lima::Common::XMLConfigurationFiles; -using namespace Lima::LinguisticProcessing::LinguisticAnalysisStructure; -using namespace Lima::LinguisticProcessing::AnalysisDict; - -namespace Lima -{ -namespace LinguisticProcessing -{ -namespace MorphologicAnalysis -{ - -SimpleFactory SimpleWordFactory(SIMPLEWORD_CLASSID); - - -SimpleWord::SimpleWord() : - m_reader(0) -{} - -SimpleWord::~SimpleWord() -{ - if (m_reader==0) - { - delete m_reader; - } -} - -void SimpleWord::init( - Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, - Manager* manager) -{ - MORPHOLOGINIT; - MediaId language = manager->getInitializationParameters().media; - m_sp=&Common::MediaticData::MediaticData::changeable().stringsPool(language); - string dico; - try - { - dico=unitConfiguration.getParamsValueAtKey("dictionary"); - } - catch (NoSuchParam& ) - { - LERROR << "no param 'dictionary' in SimpleWord group for language " << (int) language; 
- throw InvalidConfiguration(); - } - - AbstractResource* res=LinguisticResources::single().getResource(language,dico); - m_dictionary=static_cast(res); - - try - { - string confident=unitConfiguration.getParamsValueAtKey("confidentMode"); - LDEBUG << "SimpleWord set confident mode to:" << confident; - m_confidentMode=(confident=="true"); - } - catch (NoSuchParam& ) - { - LWARN << "no param 'confidentMode' in SimpleWord group for language " << (int) language; - LWARN << "use default value : 'true'"; - m_confidentMode=true; - } - - // initialize dictionary reader - - try - { - string chart=unitConfiguration.getParamsValueAtKey("charChart"); - AbstractResource* res= LinguisticResources::single().getResource(language,chart); - m_charChart=static_cast(res); - } - catch (NoSuchParam& ) - { - LERROR << "no param 'charChart' in SimpleWord group for language " << (int) language; - throw InvalidConfiguration(); - } - - m_reader=new AlternativesReader(m_confidentMode,true,true,true,m_charChart,&Common::MediaticData::MediaticData::changeable().stringsPool(language)); - - try - { - string concat=unitConfiguration.getParamsValueAtKey("parseConcatenated"); - m_parseConcatenated=(concat=="true"); - } - catch (NoSuchParam& ) - { - LWARN << "no param 'parseConcatenated' in SimpleWord group for language " << (int) language; - LWARN << "use default value : 'true'"; - m_confidentMode=true; - } - -} - - -LimaStatusCode SimpleWord::process( - AnalysisContent& analysis) const -{ - Lima::TimeUtilsController timer("SimpleWord"); - MORPHOLOGINIT; - LINFO << "starting process SimpleWord"; - - AnalysisGraph* tokenList=static_cast(analysis.getData("AnalysisGraph")); - - - LinguisticGraph* g=tokenList->getGraph(); - LinguisticGraphVertexIt it,itEnd; - VertexTokenPropertyMap tokenMap=get(vertex_token,*g); - VertexDataPropertyMap dataMap=get(vertex_data,*g); - boost::tie(it,itEnd)=vertices(*g); - for (;it!=itEnd;it++) - { - Token* currentToken=tokenMap[*it]; - if (currentToken!=0) - { -#ifdef 
DEBUG_LP - LDEBUG << "SimpleWord for token" << currentToken->stringForm(); -#endif - // Init handlers - MorphoSyntacticData* msd=dataMap[*it]; - AbstractDictionaryEntryHandler* lingInfoHandler=new MorphoSyntacticDataHandler(*msd,SIMPLE_WORD); - ConcatenatedDataHandler* concatHandler=0; - AccentedConcatenatedDataHandler* accentedConcatHandler=0; - AbstractDictionaryEntryHandler* accentedHandler=lingInfoHandler; - - if (m_parseConcatenated) { - concatHandler=new ConcatenatedDataHandler(g,currentToken,SIMPLE_WORD,m_sp); - accentedConcatHandler=new AccentedConcatenatedDataHandler( - g, - currentToken->stringForm(), - currentToken->position(), - currentToken->status(), - SIMPLE_WORD, - m_sp, - m_charChart); - SequenceEntryHandler* seh=new SequenceEntryHandler(); - seh->addHandler(lingInfoHandler); - seh->addHandler(accentedConcatHandler); - accentedHandler=seh; - } - - // parse data - DictionaryEntry entry(m_dictionary->getEntry(currentToken->form(),currentToken->stringForm())); - m_reader->readAlternatives( - *currentToken, - *m_dictionary, - lingInfoHandler, - concatHandler, - accentedHandler); - - // finalize - if (concatHandler && !concatHandler->getConcatVertices().empty()) { - linkConcatVertices(g,*it,concatHandler->getConcatVertices()); - if (msd->empty()) { - clear_vertex(*it,*g); - } - } - if (accentedConcatHandler && !accentedConcatHandler->getConcatVertices().empty()) { - linkConcatVertices(g,*it,accentedConcatHandler->getConcatVertices()); - if (msd->empty()) { - clear_vertex(*it,*g); - } - } - if (m_parseConcatenated) { - delete concatHandler; - delete accentedConcatHandler; - delete accentedHandler; - } - delete lingInfoHandler; - - } - } - - AnnotationData* annotationData = static_cast< AnnotationData* >(analysis.getData("AnnotationData")); - if (annotationData==0) - { - LINFO << "SimpleWord::process no annotation data, creating and populating it"; - annotationData=new AnnotationData(); - analysis.setData("AnnotationData",annotationData); - } - 
tokenList->populateAnnotationGraph(annotationData, "AnalysisGraph"); - if (static_cast(analysis.getData("PosGraph")) != 0) - { - static_cast(analysis.getData("PosGraph"))->populateAnnotationGraph(annotationData, "PosGraph"); - } - -#ifdef DEBUG_LP - LDEBUG << "ending process SimpleWord"; -#endif - return SUCCESS_ID; -} - -void SimpleWord::linkConcatVertices( - LinguisticGraph* graph, - LinguisticGraphVertex srcToken, - const std::vector >& concats) const -{ - LinguisticGraphInEdgeIt ieItr,ieItrEnd; - for (boost::tie(ieItr,ieItrEnd) = in_edges(srcToken,*graph); - ieItr!=ieItrEnd; - ieItr++) - { - LinguisticGraphVertex pred=source(*ieItr,*graph); - LinguisticGraphOutEdgeIt oeItr,oeItrEnd; - for (boost::tie(oeItr,oeItrEnd) = out_edges(srcToken,*graph); - oeItr!=oeItrEnd; - oeItr++) - { - LinguisticGraphVertex next=target(*oeItr,*graph); - for (std::vector >::const_iterator concatItr=concats.begin(); - concatItr!=concats.end(); - concatItr++) - { - add_edge(pred,concatItr->front(),*graph); - add_edge(concatItr->back(),next,*graph); - } - } - } -} - - -} // MorphologicAnalysis -} // LinguisticProcessing -} // Lima +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. If not, see +*/ + +// NAUTITIA +// +// jys 8-OCT-2002 +// +// SimpleWord is the implementation of the 1st module of +// Morphological Analysis. 
Each token from the main tokens
+// path is searched into the specified dictionary.
+
+
+#include "SimpleWord.h"
+
+#include "linguisticProcessing/LinguisticProcessingCommon.h"
+#include "common/MediaticData/mediaticData.h"
+#include "common/time/timeUtilsController.h"
+#include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h"
+#include "linguisticProcessing/common/annotationGraph/AnnotationData.h"
+#include "linguisticProcessing/core/LinguisticAnalysisStructure/Token.h"
+#include "linguisticProcessing/core/LinguisticAnalysisStructure/AnalysisGraph.h"
+#include "linguisticProcessing/core/LinguisticResources/LinguisticResources.h"
+#include "linguisticProcessing/core/AnalysisDict/AbstractDictionaryEntry.h"
+#include "common/AbstractFactoryPattern/SimpleFactory.h"
+#include "common/misc/fsaStringsPool.h"
+#include "MorphoSyntacticDataHandler.h"
+#include "ConcatenatedDataHandler.h"
+#include "AccentedConcatenatedDataHandler.h"
+#include "SequenceEntryHandler.h"
+
+#include
+
+using namespace std;
+using namespace Lima::Common::AnnotationGraphs;
+using namespace Lima::Common::MediaticData;
+using namespace Lima::Common::XMLConfigurationFiles;
+using namespace Lima::LinguisticProcessing::LinguisticAnalysisStructure;
+using namespace Lima::LinguisticProcessing::AnalysisDict;
+
+namespace Lima
+{
+namespace LinguisticProcessing
+{
+namespace MorphologicAnalysis
+{
+
+SimpleFactory SimpleWordFactory(SIMPLEWORD_CLASSID);
+
+
+SimpleWord::SimpleWord() :
+    m_reader(0)
+{}
+
+SimpleWord::~SimpleWord()
+{
+  delete m_reader;
+}
+// Reads the required 'dictionary' and 'charChart' parameters (throws InvalidConfiguration when one is absent) and the optional 'confidentMode' and 'parseConcatenated' flags (both default to true), and allocates m_reader.
+void SimpleWord::init(
+  Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration,
+  Manager* manager)
+{
+  MORPHOLOGINIT;
+  MediaId language = manager->getInitializationParameters().media;
+  m_sp=&Common::MediaticData::MediaticData::changeable().stringsPool(language);
+  string dico;
+  try
+  {
+    dico=unitConfiguration.getParamsValueAtKey("dictionary");
+  }
+  catch (NoSuchParam& )
+  {
+    LERROR << "no param 'dictionary' in SimpleWord group for language " << (int) language;
+    throw InvalidConfiguration();
+  }
+
+  AbstractResource* res=LinguisticResources::single().getResource(language,dico);
+  m_dictionary=static_cast(res);
+
+  try
+  {
+    string confident=unitConfiguration.getParamsValueAtKey("confidentMode");
+    LDEBUG << "SimpleWord set confident mode to:" << confident;
+    m_confidentMode=(confident=="true");
+  }
+  catch (NoSuchParam& )
+  {
+    LWARN << "no param 'confidentMode' in SimpleWord group for language " << (int) language;
+    LWARN << "use default value : 'true'";
+    m_confidentMode=true;
+  }
+
+  // initialize dictionary reader
+
+  try
+  {
+    string chart=unitConfiguration.getParamsValueAtKey("charChart");
+    AbstractResource* res= LinguisticResources::single().getResource(language,chart);
+    m_charChart=static_cast(res);
+  }
+  catch (NoSuchParam& )
+  {
+    LERROR << "no param 'charChart' in SimpleWord group for language " << (int) language;
+    throw InvalidConfiguration();
+  }
+
+  m_reader=new AlternativesReader(m_confidentMode,true,true,true,m_charChart,&Common::MediaticData::MediaticData::changeable().stringsPool(language));
+
+  try
+  {
+    string concat=unitConfiguration.getParamsValueAtKey("parseConcatenated");
+    m_parseConcatenated=(concat=="true");
+  }
+  catch (NoSuchParam& )
+  {
+    LWARN << "no param 'parseConcatenated' in SimpleWord group for language " << (int) language;
+    LWARN << "use default value : 'true'";
+    m_parseConcatenated=true; // default the flag this block handles; the old code overwrote m_confidentMode and left this one unset
+  }
+
+}
+
+
+LimaStatusCode SimpleWord::process(
+  AnalysisContent& analysis) const
+{
+  Lima::TimeUtilsController timer("SimpleWord");
+  MORPHOLOGINIT;
+  LINFO << "starting process SimpleWord";
+
+#ifdef ANTINNO_SPECIFIC
+  auto const& stopAnalyze = analysis.stopAnalyze();
+#endif
+  AnalysisGraph* tokenList=static_cast(analysis.getData("AnalysisGraph"));
+
+
+  LinguisticGraph* g=tokenList->getGraph();
+  LinguisticGraphVertexIt it,itEnd;
+  VertexTokenPropertyMap tokenMap=get(vertex_token,*g);
+  VertexDataPropertyMap
dataMap=get(vertex_data,*g); + boost::tie(it,itEnd)=vertices(*g); + for (;it!=itEnd;it++) + { +#ifdef ANTINNO_SPECIFIC + if (stopAnalyze) + { + LERROR << "Analyze too long. Stopped in SimpleWord"; + return TIME_OVERFLOW; + } +#endif + Token* currentToken=tokenMap[*it]; + if (currentToken!=0) + { +#ifdef DEBUG_LP + LDEBUG << "SimpleWord for token" << currentToken->stringForm(); +#endif + // Init handlers + MorphoSyntacticData* msd=dataMap[*it]; + AbstractDictionaryEntryHandler* lingInfoHandler=new MorphoSyntacticDataHandler(*msd,SIMPLE_WORD); + ConcatenatedDataHandler* concatHandler=0; + AccentedConcatenatedDataHandler* accentedConcatHandler=0; + AbstractDictionaryEntryHandler* accentedHandler=lingInfoHandler; + + if (m_parseConcatenated) { + concatHandler=new ConcatenatedDataHandler(g,currentToken,SIMPLE_WORD,m_sp); + accentedConcatHandler=new AccentedConcatenatedDataHandler( + g, + currentToken->stringForm(), + currentToken->position(), + currentToken->status(), + SIMPLE_WORD, + m_sp, + m_charChart); + SequenceEntryHandler* seh=new SequenceEntryHandler(); + seh->addHandler(lingInfoHandler); + seh->addHandler(accentedConcatHandler); + accentedHandler=seh; + } + + // parse data + DictionaryEntry entry(m_dictionary->getEntry(currentToken->form(),currentToken->stringForm())); + m_reader->readAlternatives( + *currentToken, + *m_dictionary, + lingInfoHandler, + concatHandler, + accentedHandler); + + // finalize + if (concatHandler && !concatHandler->getConcatVertices().empty()) { + linkConcatVertices(g,*it,concatHandler->getConcatVertices()); + if (msd->empty()) { + clear_vertex(*it,*g); + } + } + if (accentedConcatHandler && !accentedConcatHandler->getConcatVertices().empty()) { + linkConcatVertices(g,*it,accentedConcatHandler->getConcatVertices()); + if (msd->empty()) { + clear_vertex(*it,*g); + } + } + if (m_parseConcatenated) { + delete concatHandler; + delete accentedConcatHandler; + delete accentedHandler; + } + delete lingInfoHandler; + + } + } + + 
AnnotationData* annotationData = static_cast< AnnotationData* >(analysis.getData("AnnotationData")); + if (annotationData==0) + { + LINFO << "SimpleWord::process no annotation data, creating and populating it"; + annotationData=new AnnotationData(); + analysis.setData("AnnotationData",annotationData); + } + tokenList->populateAnnotationGraph(annotationData, "AnalysisGraph"); + if (static_cast(analysis.getData("PosGraph")) != 0) + { + static_cast(analysis.getData("PosGraph"))->populateAnnotationGraph(annotationData, "PosGraph"); + } + +#ifdef DEBUG_LP + LDEBUG << "ending process SimpleWord"; +#endif + return SUCCESS_ID; +} + +void SimpleWord::linkConcatVertices( + LinguisticGraph* graph, + LinguisticGraphVertex srcToken, + const std::vector >& concats) const +{ + LinguisticGraphInEdgeIt ieItr,ieItrEnd; + for (boost::tie(ieItr,ieItrEnd) = in_edges(srcToken,*graph); + ieItr!=ieItrEnd; + ieItr++) + { + LinguisticGraphVertex pred=source(*ieItr,*graph); + LinguisticGraphOutEdgeIt oeItr,oeItrEnd; + for (boost::tie(oeItr,oeItrEnd) = out_edges(srcToken,*graph); + oeItr!=oeItrEnd; + oeItr++) + { + LinguisticGraphVertex next=target(*oeItr,*graph); + for (std::vector >::const_iterator concatItr=concats.begin(); + concatItr!=concats.end(); + concatItr++) + { + add_edge(pred,concatItr->front(),*graph); + add_edge(concatItr->back(),next,*graph); + } + } + } +} + + +} // MorphologicAnalysis +} // LinguisticProcessing +} // Lima diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/DynamicSvmToolPosTagger.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/DynamicSvmToolPosTagger.cpp index 820002935..4ed337397 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/DynamicSvmToolPosTagger.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/DynamicSvmToolPosTagger.cpp @@ -25,6 +25,7 @@ #include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" +#include 
"common/tools/FileUtils.h" #include "common/MediaticData/mediaticData.h" #include "common/time/traceUtils.h" #include "linguisticProcessing/common/annotationGraph/AnnotationData.h" @@ -46,7 +47,9 @@ #include #include #include +#ifdef ANTINNO_SPECIFIC #include +#endif #include // LDBL_MIN/MAX #include // log @@ -148,7 +151,7 @@ void DynamicSvmToolPosTagger::init( // Creates the tagger we use erCompRegExp(); - t = new tagger(resourcesPath + "/" + model); + t = new tagger(Common::Misc::findFileInPaths(resourcesPath.c_str(), model.c_str()).toUtf8().constData()); t->taggerLoadModelsForTagging(); t->taggerShowComments(); t->taggerActiveShowScoresFlag(); @@ -195,8 +198,13 @@ LimaStatusCode DynamicSvmToolPosTagger::process(AnalysisContent& analysis) const std::map maxAncestor; /* Push every vertex coming from vertex 0 onto the "tokens to be visited" list */ +#ifdef ANTINNO_SPECIFIC BOOST_FOREACH(LinguisticGraphVertex vertex, - nextTokens(analysisGraph->firstVertex(), srcGraph)) + nextTokens(analysisGraph->firstVertex(), srcGraph)) +#else + for(LinguisticGraphVertex vertex: + nextTokens(analysisGraph->firstVertex(), srcGraph)) +#endif { tokenQueue.push(vertex); } @@ -218,7 +226,11 @@ LimaStatusCode DynamicSvmToolPosTagger::process(AnalysisContent& analysis) const /* For every ancestor of our node */ std::set previousTokens = getPreviousTokens(vertex, srcGraph); if(previousTokens.empty()) previousTokens.insert(posGraph->firstVertex()); +#ifdef ANTINNO_SPECIFIC BOOST_FOREACH(LinguisticGraphVertex prevVertex, previousTokens) { +#else + for(LinguisticGraphVertex prevVertex: previousTokens) { +#endif std::string pos = ""; double logCurWeight = log(1.0), w; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/SvmToolPosTagger.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/SvmToolPosTagger.cpp index b8871d1e4..a518b7804 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/SvmToolPosTagger.cpp +++ 
b/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/SvmToolPosTagger.cpp @@ -27,14 +27,12 @@ #include "linguisticProcessing/common/PropertyCode/PropertyCodeManager.h" #include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" #include "common/MediaticData/mediaticData.h" +#include "common/tools/FileUtils.h" #include "common/Data/strwstrtools.h" #include "common/time/timeUtilsController.h" #include "svmtool/tagger.h" -#include -#include -#include - +#include int verbose = FALSE; @@ -89,7 +87,10 @@ void SvmToolPosTagger::init( string resourcesPath=MediaticData::single().getResourcesPath(); try { - m_model = resourcesPath + "/" + unitConfiguration.getParamsValueAtKey("model"); + string modelName=unitConfiguration.getParamsValueAtKey("model"); + // add .DICT to find the file, remove it to get the generic model name + path + m_model = Common::Misc::findFileInPaths(resourcesPath.c_str(), modelName.append(".DICT").c_str()).toUtf8().constData(); + boost::replace_last(m_model,".DICT",""); } catch (Common::XMLConfigurationFiles::NoSuchParam& ) { diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/ViterbiPosTagger.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/ViterbiPosTagger.cpp index 26e64e006..58e3c856d 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/ViterbiPosTagger.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/ViterbiPosTagger.cpp @@ -24,6 +24,7 @@ #include "ViterbiPosTagger.h" #include "linguisticProcessing/core/LinguisticResources/LinguisticResources.h" +#include "common/tools/FileUtils.h" #include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" #include "common/MediaticData/mediaticData.h" #include "integerCost.h" @@ -38,7 +39,7 @@ namespace LinguisticProcessing namespace PosTagger { -ViterbiPosTaggerFactory* ViterbiPosTaggerFactory::s_instance=new ViterbiPosTaggerFactory(VITERBIPOSTAGGER_CLASSID); 
+std::unique_ptr< ViterbiPosTaggerFactory > ViterbiPosTaggerFactory::s_instance=std::unique_ptr< ViterbiPosTaggerFactory >(new ViterbiPosTaggerFactory(VITERBIPOSTAGGER_CLASSID)); ViterbiPosTaggerFactory::ViterbiPosTaggerFactory(const std::string& id) : @@ -73,8 +74,8 @@ MediaProcessUnit* ViterbiPosTaggerFactory::create( string resourcesPath=MediaticData::single().getResourcesPath(); try { - trigramsFile=resourcesPath + "/" + unitConfiguration.getParamsValueAtKey("trigramFile"); - bigramsFile=resourcesPath + "/" + unitConfiguration.getParamsValueAtKey("bigramFile"); + trigramsFile = Common::Misc::findFileInPaths(resourcesPath.c_str(), unitConfiguration.getParamsValueAtKey("trigramFile").c_str()).toUtf8().constData(); + bigramsFile=Common::Misc::findFileInPaths(resourcesPath.c_str(), unitConfiguration.getParamsValueAtKey("bigramFile").c_str()).toUtf8().constData(); } catch (Common::XMLConfigurationFiles::NoSuchParam& ) { diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/ViterbiPosTagger.h b/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/ViterbiPosTagger.h index 696efca31..68fcf255f 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/ViterbiPosTagger.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/ViterbiPosTagger.h @@ -199,7 +199,7 @@ class LIMA_POSTAGGER_EXPORT ViterbiPosTaggerFactory : public InitializableObject private: ViterbiPosTaggerFactory(const std::string& id); - static ViterbiPosTaggerFactory* s_instance; + static std::unique_ptr< ViterbiPosTaggerFactory > s_instance; }; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/ngramMatrices.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/ngramMatrices.cpp index 6fdc50576..ab97892fe 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/ngramMatrices.cpp +++ 
b/lima_linguisticprocessing/src/linguisticProcessing/core/PosTagger/ngramMatrices.cpp @@ -26,6 +26,7 @@ */ #include "ngramMatrices.h" +#include "common/tools/FileUtils.h" #include "common/Data/strwstrtools.h" #include "common/MediaticData/mediaticData.h" #include "linguisticProcessing/common/linguisticData/languageData.h" @@ -68,7 +69,7 @@ void TrigramMatrix::init( string resourcesPath=MediaticData::single().getResourcesPath(); try { - string trigramFile=resourcesPath + "/" + unitConfiguration.getParamsValueAtKey("trigramFile"); + string trigramFile = Common::Misc::findFileInPaths(resourcesPath.c_str(), unitConfiguration.getParamsValueAtKey("trigramFile").c_str()).toUtf8().constData(); readTrigramMatrixFile(trigramFile); } catch (Common::XMLConfigurationFiles::NoSuchParam& ) @@ -96,9 +97,8 @@ void TrigramMatrix::readTrigramMatrixFile(const std::string& fileName) boost::regex linere("^(.+)\t(.+)\t(.+)\t(\\d+(\\.\\d+)?)$"); boost::regex numre("^\\d+$"); - std::string lineString; + std::string lineString = Lima::Common::Misc::readLine(ifl); size_t linenum(0); - getline(ifl, lineString); while (ifl.good() && !ifl.eof()) { Common::Misc::chomp(lineString); @@ -140,7 +140,7 @@ void TrigramMatrix::readTrigramMatrixFile(const std::string& fileName) // LDEBUG << "Got trigram: ["<& regexes = unitConfiguration.getMapAtKey("regexes"); for (std::map ::const_iterator it = regexes.begin(); it != regexes.end(); it++) { diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/CMakeLists.txt b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/CMakeLists.txt index af63988ed..3f19bb33a 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/CMakeLists.txt +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/CMakeLists.txt @@ -21,14 +21,56 @@ add_definitions(-DLIMA_SEMANTICANALYSIS_EXPORTING) ########### next target ############### SET(lima-lp-semanticanalysis_LIB_SRCS - 
ConstraintFunction.cpp SemanticRelationAnnotation.cpp SemanticRelationData.cpp SemanticRoleLabelingLoader.cpp LimaConllTokenIdMapping.cpp + ConstraintFunction.cpp + SemanticRelationAnnotation.cpp + SemanticRelationData.cpp + SemanticRoleLabelingLoader.cpp + LimaConllTokenIdMapping.cpp + SemanticRelationsXmlLogger.cpp + ConllDumper.cpp ) -add_library(lima-lp-semanticanalysis SHARED ${lima-lp-semanticanalysis_LIB_SRCS}) +if (${PYTHONLIBS_FOUND}) + SET(lima-lp-semanticanalysis_LIB_SRCS + KnowledgeBasedSemanticRoleLabeler.cpp + ${lima-lp-semanticanalysis_LIB_SRCS} + ) +endif() +DECLARE_LIMA_PLUGIN(lima-lp-semanticanalysis) -target_link_libraries(lima-lp-semanticanalysis lima-common-factory lima-common-misc lima-common-data lima-common-fsaaccess lima-common-mediaticdata lima-common-time lima-common-factory lima-common-xmlconfigurationfiles lima-common-processunitframework lima-common-mediaprocessors lima-lp-linguisticprocessors lima-lp-linguisticresources lima-lp-annotationgraph lima-lp-linguisticanalysisstructure lima-lp-textsegmentation lima-lp-syntacticanalysis lima-lp-automaton lima-lp-applyrecognizer +#add_library(lima-lp-semanticanalysis SHARED ${lima-lp-semanticanalysis_LIB_SRCS}) + + +target_link_libraries(lima-lp-semanticanalysis + lima-common-factory + lima-common-misc + lima-common-data + lima-common-fsaaccess + lima-common-mediaticdata + lima-common-time + lima-common-factory + lima-common-xmlconfigurationfiles + lima-common-processunitframework + lima-common-mediaprocessors + lima-lp-linguisticprocessors + lima-lp-linguisticresources + lima-lp-annotationgraph + lima-lp-linguisticanalysisstructure + lima-lp-textsegmentation + lima-lp-syntacticanalysis + lima-lp-automaton + lima-lp-applyrecognizer + ${optionalLibs} + ${Boost_LIBRARIES} + ${QT_LIBRARIES} ) +if (${PYTHONLIBS_FOUND}) + target_link_libraries(lima-lp-semanticanalysis + ${PYTHON_LIBRARY} + ) +endif() + set_target_properties(lima-lp-semanticanalysis PROPERTIES VERSION ${LIMA_LP_LIB_VERSION} 
SOVERSION ${LIMA_LP_LIB_SOVERSION}) install(TARGETS lima-lp-semanticanalysis DESTINATION lib) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/ConllDumper.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/ConllDumper.cpp new file mode 100644 index 000000000..de0b8e766 --- /dev/null +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/ConllDumper.cpp @@ -0,0 +1,1344 @@ +#ifdef ANTINNO_SPECIFIC + + + + +// antinno travaille avec la version 2.1-patches tant que la version master n'est pas synchronisée + + + +/* + Copyright 2002-2014 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. 
If not, see +*/ + +#include "ConllDumper.h" +#include "common/MediaProcessors/DumperStream.h" +#include "common/time/traceUtils.h" +#include "common/Data/strwstrtools.h" +#include "common/MediaticData/mediaticData.h" +#include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" +#include "common/AbstractFactoryPattern/SimpleFactory.h" +#include "linguisticProcessing/LinguisticProcessingCommon.h" +#include "linguisticProcessing/common/annotationGraph/AnnotationGraph.h" +#include "linguisticProcessing/common/annotationGraph/AnnotationData.h" +#include "linguisticProcessing/core/LinguisticProcessors/LinguisticMetaData.h" +#include "linguisticProcessing/core/LinguisticResources/LinguisticResources.h" +#include "linguisticProcessing/core/LinguisticAnalysisStructure/LinguisticGraph.h" +#include "linguisticProcessing/core/LinguisticAnalysisStructure/AnalysisGraph.h" +#include "linguisticProcessing/core/TextSegmentation/SegmentationData.h" +#include "linguisticProcessing/core/SyntacticAnalysis/SyntacticData.h" +#include "linguisticProcessing/core/LinguisticAnalysisStructure/MorphoSyntacticData.h" +#include "linguisticProcessing/core/LinguisticAnalysisStructure/MorphoSyntacticDataUtils.h" +#include "linguisticProcessing/core/Automaton/SpecificEntityAnnotation.h" +#include "common/misc/AbstractAccessByString.h" +#include "linguisticProcessing/core/AnalysisDumpers/EasyXmlDumper/ConstituantAndRelationExtractor.h" +#include "linguisticProcessing/core/AnalysisDumpers/EasyXmlDumper/relation.h" +#include "linguisticProcessing/core/SemanticAnalysis/LimaConllTokenIdMapping.h" + +#include +#include +#include + +#include + +using namespace Lima::Common; +using namespace Lima::Common::MediaticData; +using namespace Lima::Common::XMLConfigurationFiles; +using namespace Lima::Common::AnnotationGraphs; +using namespace Lima::LinguisticProcessing::SpecificEntities; +using namespace Lima::LinguisticProcessing::SemanticAnalysis; +using namespace 
Lima::LinguisticProcessing::SyntacticAnalysis; +using namespace Lima::LinguisticProcessing::LinguisticAnalysisStructure; + +namespace Lima +{ + +namespace LinguisticProcessing +{ + +namespace AnalysisDumpers +{ + +SimpleFactory conllDumperFactory(CONLLDUMPER_CLASSID); + +class ConllDumperPrivate +{ + friend class ConllDumper; + ConllDumperPrivate(); + + virtual ~ConllDumperPrivate(); + + /** + * @brief Collect all annotation tokens corresponding to a predicate of the + * sentence starting at @ref sentenceBegin and finishing at @ref sentenceEnd + */ + QMultiMap collectPredicateTokens( + Lima::AnalysisContent& analysis, LinguisticGraphVertex sentenceBegin, LinguisticGraphVertex sentenceEnd); + + MediaId m_language; + std::string m_property; + const Common::PropertyCode::PropertyAccessor* m_propertyAccessor; + const Common::PropertyCode::PropertyManager* m_propertyManager; + const Common::PropertyCode::PropertyManager* m_timeManager; //Ajout + const Common::PropertyCode::PropertyAccessor* m_timeAccessor; //Ajout + + std::string m_graph; + std::string m_sep; + std::string m_sepPOS; + std::string m_verbTenseFlag; //Ajout + QMap m_conllLimaDepMapping; + std::string m_suffix; +}; + + +ConllDumperPrivate::ConllDumperPrivate(): +m_language(0), +m_property("MICRO"), +m_propertyAccessor(0), +m_propertyManager(0), +m_graph("PosGraph"), +m_sep(" "), +m_sepPOS("#"), +m_conllLimaDepMapping(), +m_suffix(".conll") +{ +} + +ConllDumperPrivate::~ConllDumperPrivate() +{} + +ConllDumper::ConllDumper(): +AbstractTextualAnalysisDumper(), +m_d(new ConllDumperPrivate()) +{ +} + +ConllDumper::~ConllDumper() +{ + delete m_d; +} + +void ConllDumper::init(Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, + Manager* manager) +{ + DUMPERLOGINIT; + AbstractTextualAnalysisDumper::init(unitConfiguration,manager); + m_d->m_language=manager->getInitializationParameters().media; + try + { + m_d->m_graph=unitConfiguration.getParamsValueAtKey("graph"); + } + catch 
(NoSuchParam& ) {} // keep default value + const Common::PropertyCode::PropertyCodeManager& codeManager=static_cast(Common::MediaticData::MediaticData::single().mediaData(m_d->m_language)).getPropertyCodeManager(); + m_d->m_propertyAccessor=&codeManager.getPropertyAccessor("MICRO"); + + try + { + m_d->m_verbTenseFlag=unitConfiguration.getParamsValueAtKey("verbTenseFlag"); + } + catch (NoSuchParam& ) { + m_d->m_verbTenseFlag=std::string("False"); + } // keep default value + + try + { + m_d->m_sep=unitConfiguration.getParamsValueAtKey("sep"); + } + catch (NoSuchParam& ) {} // keep default value + + try + { + m_d->m_sepPOS=unitConfiguration.getParamsValueAtKey("sepPOS"); + } + catch (NoSuchParam& ) {} // keep default value + + try + { + m_d->m_property=unitConfiguration.getParamsValueAtKey("property"); + } + catch (NoSuchParam& ) {} // keep default value + try + { + m_d->m_suffix=unitConfiguration.getParamsValueAtKey("outputSuffix"); + } + catch (NoSuchParam& ) {} // keep default value + m_d->m_propertyManager=&codeManager.getPropertyManager(m_d->m_property); + + m_d->m_timeManager=&codeManager.getPropertyManager("TIME"); + m_d->m_timeAccessor=&codeManager.getPropertyAccessor("TIME"); + + try { + std::string resourcePath = Common::MediaticData::MediaticData::single().getResourcesPath(); + std::string mappingFile = resourcePath + "/" + unitConfiguration.getParamsValueAtKey("mappingFile"); + std::ifstream ifs(mappingFile, std::ifstream::binary); + if (!ifs.good()) + { + LERROR << "ERROR: cannot open"+ mappingFile; + throw InvalidConfiguration(); + } + while (ifs.good() && !ifs.eof()) + { + std::string line = Lima::Common::Misc::readLine(ifs); + QStringList strs = QString::fromUtf8(line.c_str()).split('\t'); + if (strs.size() == 2) + { + m_d->m_conllLimaDepMapping.insert(strs[0],strs[1]); + } + } + + } catch (Common::XMLConfigurationFiles::NoSuchParam& ) + { + LINFO << "no parameter 'mappingFile' in ConllDumper group" << " !"; +// throw InvalidConfiguration(); + } +} + 
+LimaStatusCode ConllDumper::process(AnalysisContent& analysis) const +{ +#ifdef DEBUG_LP + DUMPERLOGINIT; + LDEBUG << "ConllDumper::process"; +#endif + + LinguisticMetaData* metadata = static_cast(analysis.getData("LinguisticMetaData")); + if (metadata == 0) + { + DUMPERLOGINIT; + LERROR << "ConllDumper::process no LinguisticMetaData ! abort"; + return MISSING_DATA; + } + AnnotationData* annotationData = static_cast(analysis.getData("AnnotationData")); + if (annotationData == 0) + { + DUMPERLOGINIT; + LERROR << "ConllDumper::process no AnnotationData ! abort"; + return MISSING_DATA; + } + AnalysisGraph* tokenList=static_cast(analysis.getData(m_d->m_graph));//est de type PosGraph et non pas AnalysisGraph + if (tokenList==0) + { + DUMPERLOGINIT; + LERROR << "ConllDumper::process graph " << m_d->m_graph << " has not been produced: check pipeline"; + return MISSING_DATA; + } + LinguisticGraph* graph=tokenList->getGraph(); + SegmentationData* sd=static_cast(analysis.getData("SentenceBoundaries")); + if (sd==0) + { + DUMPERLOGINIT; + LERROR << "ConllDumper::process no SentenceBoundaries! 
abort"; + return MISSING_DATA; + } + + SyntacticData* syntacticData=static_cast(analysis.getData("SyntacticData")); + if (syntacticData==0) + { + syntacticData=new SyntacticData(tokenList,0); + syntacticData->setupDependencyGraph(); + analysis.setData("SyntacticData",syntacticData); + } + const DependencyGraph* depGraph = syntacticData-> dependencyGraph(); + + QScopedPointer dstream(initialize(analysis)); + + std::map< LinguisticGraphVertex, std::pair > vertexDependencyInformations; + + uint64_t nbSentences((sd->getSegments()).size()); + if (nbSentences == 0) + { + DUMPERLOGINIT; + LERROR << "ConllDumper::process 0 sentence to process"; + return SUCCESS_ID; + } + + std::vector::iterator sbItr=(sd->getSegments().begin()); +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process There are "<< nbSentences << " sentences"; +#endif + LinguisticGraphVertex sentenceBegin = sbItr->getFirstVertex(); + LinguisticGraphVertex sentenceEnd = sbItr->getLastVertex(); + + + const FsaStringsPool& sp=Common::MediaticData::MediaticData::single().stringsPool(m_d->m_language); +// for (auto im=m_d->m_conllLimaDepMapping.begin();im!=m_d->m_conllLimaDepMapping.end();im++) +// { +// LDEBUG << "("<< (*im).first<< "," << (*im).second << ")" << endl; +// } + + LimaConllTokenIdMapping* limaConllTokenIdMapping = static_cast(analysis.getData("LimaConllTokenIdMapping")); + if (limaConllTokenIdMapping == 0) + { + limaConllTokenIdMapping = new LimaConllTokenIdMapping(); + analysis.setData("LimaConllTokenIdMapping", limaConllTokenIdMapping); + } + int sentenceNb=0; + + while (sbItr != sd->getSegments().end() ) //for each sentence + { + sentenceNb++; + sentenceBegin=sbItr->getFirstVertex(); + sentenceEnd=sbItr->getLastVertex(); + std::mapsegmentationMapping;//mapping the two types of segmentations (Lima and conll) + std::mapsegmentationMappingReverse; + +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process begin - end: " << sentenceBegin << " - " << sentenceEnd; +#endif + //LinguisticGraphOutEdgeIt 
outItr,outItrEnd; + QQueue toVisit; + QSet visited; + toVisit.enqueue(sentenceBegin); + int tokenId = 0; + LinguisticGraphVertex v = 0; + while (v != sentenceEnd && !toVisit.empty()) + + { + v = toVisit.dequeue(); +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process Vertex index : " << v; +#endif + visited.insert(v); + segmentationMapping.insert(std::make_pair(v,tokenId)); + segmentationMappingReverse.insert(std::make_pair(tokenId,v)); +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process conll id : " << tokenId << " Lima id : " << v; +#endif + DependencyGraphVertex dcurrent = syntacticData->depVertexForTokenVertex(v); + DependencyGraphOutEdgeIt dit, dit_end; + boost::tie(dit,dit_end) = boost::out_edges(dcurrent,*depGraph); + for (; dit != dit_end; dit++) + { +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process Dumping dependency edge " << (*dit).m_source << " -> " << (*dit).m_target; +#endif + try + { + CEdgeDepRelTypePropertyMap typeMap = get(edge_deprel_type, *depGraph); + SyntacticRelationId type = typeMap[*dit]; + std::string syntRelName=static_cast(Common::MediaticData::MediaticData::single().mediaData(m_d->m_language)).getSyntacticRelationName(type); +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process relation = " << syntRelName; + LDEBUG << "ConllDumper::process Src : Dep vertex= " << boost::source(*dit, *depGraph); + LinguisticGraphVertex src = syntacticData->tokenVertexForDepVertex(boost::source(*dit, *depGraph)); + LDEBUG << "ConllDumper::process Src : Morph vertex= " << src; + LDEBUG << "ConllDumper::process Targ : Dep vertex= " << boost::target(*dit, *depGraph); +#endif + LinguisticGraphVertex dest = syntacticData->tokenVertexForDepVertex(boost::target(*dit, *depGraph)); +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process Targ : Morph vertex= " << dest; +#endif + if (syntRelName!="") + { +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process saving target for" << v << ":" << dest << syntRelName; +#endif + vertexDependencyInformations.insert(std::make_pair(v, 
std::make_pair(dest,syntRelName))); + } + } + catch (const std::range_error& ) + { + } + catch (...) + { +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process: catch others....."; +#endif + throw; + } + } + if (v == sentenceEnd) + { + continue; + } + LinguisticGraphOutEdgeIt outItr,outItrEnd; + for (boost::tie(outItr,outItrEnd)=boost::out_edges(v,*graph); outItr!=outItrEnd; outItr++) + { + LinguisticGraphVertex next=boost::target(*outItr,*graph); + if (!visited.contains(next) && next != tokenList->lastVertex()) + { + toVisit.enqueue(next); + } + } + ++tokenId; + } + + // instead of looking to all vertices, follow the graph (in + // morphological graph, some vertices are not related to main graph: + // idiomatic expressions parts and named entity parts) + + toVisit.clear(); + visited.clear(); + + sentenceBegin=sbItr->getFirstVertex(); + sentenceEnd=sbItr->getLastVertex(); + + // get the list of predicates for the current sentence + QMultiMap predicates = m_d->collectPredicateTokens( analysis, sentenceBegin, sentenceEnd ); +#ifdef DEBUG_LP + //LDEBUG << "ConllDumper::process predicates for sentence between" << sentenceBegin << "and" << sentenceEnd << "are:" << predicates; +#endif + QList< LinguisticGraphVertex > keys = predicates.keys(); + + toVisit.enqueue(sentenceBegin); + tokenId=0; + v=0; + while (!toVisit.empty() && v!=sentenceEnd) + { //as long as there are vertices in the sentence + v = toVisit.dequeue(); + + Token* ft=get(vertex_token,*graph,v); + MorphoSyntacticData* morphoData=get(vertex_data,*graph, v); +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process PosGraph token" << v; +#endif + if( morphoData!=0 && !morphoData->empty() && ft != 0) + { + const QString macro=QString::fromUtf8(static_cast(Common::MediaticData::MediaticData::single().mediaData(m_d->m_language)).getPropertyCodeManager().getPropertyManager("MACRO").getPropertySymbolicValue(morphoData->firstValue(*m_d->m_propertyAccessor)).c_str()); + const QString 
micro=QString::fromUtf8(static_cast(Common::MediaticData::MediaticData::single().mediaData(m_d->m_language)).getPropertyCodeManager().getPropertyManager("MICRO").getPropertySymbolicValue(morphoData->firstValue(*m_d->m_propertyAccessor)).c_str()); +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process graphTag:" << micro; +#endif + + std::string inflectedToken=ft->stringForm().toUtf8().constData(); + std::string lemmatizedToken; + if (morphoData != 0 && !morphoData->empty()) + { + lemmatizedToken=sp[(*morphoData)[0].lemma].toUtf8().constData(); + } + + QString neType = QString::fromUtf8("_") ; + std::set< AnnotationGraphVertex > anaVertices = annotationData->matches("PosGraph",v,"AnalysisGraph"); + // note: anaVertices size should be 0 or 1 + for (std::set< AnnotationGraphVertex >::const_iterator anaVerticesIt = anaVertices.begin(); + anaVerticesIt != anaVertices.end(); anaVerticesIt++) + { + std::set< AnnotationGraphVertex > matches = annotationData->matches("AnalysisGraph",*anaVerticesIt,"annot"); + for (std::set< AnnotationGraphVertex >::const_iterator it = matches.begin(); + it != matches.end(); it++) + { + AnnotationGraphVertex vx=*it; + if (annotationData->hasAnnotation(vx, Common::Misc::utf8stdstring2limastring("SpecificEntity"))) + { + const SpecificEntityAnnotation* se = + annotationData->annotation(vx, Common::Misc::utf8stdstring2limastring("SpecificEntity")). 
+ pointerValue(); + neType = Common::MediaticData::MediaticData::single().getEntityName(se->getType()); + break; + } + } + if (neType != "_") break; + } + QString conllRelName = "_"; + int targetConllId = 0; + if (vertexDependencyInformations.count(v)!=0) + { + LinguisticGraphVertex target=vertexDependencyInformations.find(v)->second.first; +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process target saved for" << v << "is" << target; +#endif + if (segmentationMapping.find(target) != segmentationMapping.end()) + { + targetConllId=segmentationMapping.find(target)->second; + } + else + { + DUMPERLOGINIT; + LERROR << "ConllDumper::process target" << target << "not found in segmantation mapping"; + } +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process conll target saved for " << tokenId << " is " << targetConllId; +#endif + QString relName = QString::fromUtf8(vertexDependencyInformations.find(v)->second.second.c_str()); +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process the lima dependency tag for " + << ft->stringForm()<< " is " << relName; +#endif + if (m_d->m_conllLimaDepMapping.contains(relName)) + { + conllRelName=m_d->m_conllLimaDepMapping[relName]; + } + else + { + conllRelName= relName; +// LERROR << "ConllDumper::process" << relName << "not found in mapping"; + } + } + // Modified CONLL-X format with an extra named entity type column + // http://ilk.uvt.nl/conll/#dataformat + // 1 ID Token counter, starting at 1 for each new sentence. + // 2 FORM Word form or punctuation symbol. + // 3 LEMMA Lemma or stem (depending on particular data set) of word form, or an underscore if not available. + // 4 CPOSTAG Coarse-grained part-of-speech tag, where tagset depends on the language. + // 5 POSTAG Fine-grained part-of-speech tag, where the tagset depends on the language, or identical to the coarse-grained part-of-speech tag if not available. 
+ // 6 NER Extra column: Named entity type + // 7 FEATS Unordered set of syntactic and/or morphological features (depending on the particular language), separated by a vertical bar (|), or an underscore if not available. + // 8 HEAD Head of the current token, which is either a value of ID or zero ('0'). Note that depending on the original treebank annotation, there may be multiple tokens with an ID of zero. + // 9 DEPREL Dependency relation to the HEAD. The set of dependency relations depends on the particular language. Note that depending on the original treebank annotation, the dependency relation may be meaningfull or simply 'ROOT'. + // 10 PHEAD Projective head of current token, which is either a value of ID or zero ('0'), or an underscore if not available. Note that depending on the original treebank annotation, there may be multiple tokens an with ID of zero. The dependency structure resulting from the PHEAD column is guaranteed to be projective (but is not available for all languages), whereas the structures resulting from the HEAD column will be non-projective for some sentences of some languages (but is always available). + // 11 PDEPREL Dependency relation to the PHEAD, or an underscore if not available. The set of dependency relations depends on the particular language. Note that depending on the original treebank annotation, the dependency relation may be meaningfull or simply 'ROOT'. + + QString targetConllIdString = targetConllId > 0 ? 
QString("%1").arg(targetConllId) : "_"; + dstream->out() << tokenId + << "\t" << inflectedToken + << "\t" << lemmatizedToken + << "\t" << macro.toUtf8().constData() + << "\t" << micro.toUtf8().constData() + << "\t" << neType.toUtf8().constData() + << "\t" << "_" + << "\t" << targetConllIdString.toUtf8().constData() + << "\t" << conllRelName.toUtf8().constData() + << "\t" << "_" + << "\t" << "_"; + if (!predicates.isEmpty()) + { + dstream->out() << "\t"; +// LDEBUG << "ConllDumper::process output the predicate if any"; + if (!predicates.contains(v)) + { + // No predicate for this token + dstream->out() << "_"; + } + else + { + // This token is a predicate, output it + QString predicateAnnotation = annotationData->stringAnnotation(predicates.value(v),"Predicate"); + dstream->out() << predicateAnnotation; + } + + // Now output the roles supported by the current PoS graph token +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process output the roles for the" << keys.size() << "predicates"; +#endif + for (int i = 0; i < keys.size(); i++) + { + // There will be one column for each predicate. Output the + // separator right now + dstream->out() << "\t"; + AnnotationGraphVertex predicateVertex = predicates.value(keys[keys.size()-1-i]); + + std::set< AnnotationGraphVertex > vMatches = annotationData->matches("PosGraph", v, "annot"); + if (vMatches.empty()) + { +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process no node matching PoS graph vertex" << v << "in the annotation graph. 
Output '_'."; +#endif + dstream->out() << "_"; + } + else + { +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process there is"<getGraph()); + for (; vMatchInEdgesIt != vMatchInEdgesIt_end; vMatchInEdgesIt++) + { + AnnotationGraphVertex inVertex = boost::source(*vMatchInEdgesIt, annotationData->getGraph()); + std::set< LinguisticGraphVertex > inVertexAnnotPosGraphMatches = annotationData->matches("annot",inVertex,"PosGraph"); + if (inVertex == predicateVertex && !inVertexAnnotPosGraphMatches.empty()) + { + // Current edge is holding a role of the current predicate + roleAnnotation = annotationData->stringAnnotation(*vMatchInEdgesIt,"SemanticRole"); + break; + } + else + { + // Current edge does not hold a role of the current predicate +// dstream->out() << "_"; + } + } + if (roleAnnotation != "_") break; + } + dstream->out() << roleAnnotation.toUtf8().constData(); + } + } + } + dstream->out() << std::endl; + } + + if (v == sentenceEnd) + { + continue; + } +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process look at out edges of" << v; +#endif + LinguisticGraphOutEdgeIt outIter,outIterEnd; + for (boost::tie(outIter,outIterEnd) = boost::out_edges(v,*graph); outIter!=outIterEnd; outIter++) + { + LinguisticGraphVertex next = boost::target(*outIter,*graph); +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process looking out vertex" << next; +#endif + if (!visited.contains(next)) + { +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process enqueuing" << next; +#endif + visited.insert(next); + toVisit.enqueue(next); + } + } + tokenId++; + } + dstream->out() << std::endl; + limaConllTokenIdMapping->insert(std::make_pair(sentenceNb, segmentationMappingReverse)); + sbItr++; + } + + return SUCCESS_ID; + +} + +QMultiMap ConllDumperPrivate::collectPredicateTokens( + Lima::AnalysisContent& analysis, LinguisticGraphVertex sentenceBegin, LinguisticGraphVertex sentenceEnd) +{ +#ifdef DEBUG_LP + DUMPERLOGINIT; +#endif + QMap result; + + AnnotationData* annotationData = 
static_cast(analysis.getData("AnnotationData")); + + AnalysisGraph* tokenList=static_cast(analysis.getData(m_graph)); + if (tokenList==0) { + DUMPERLOGINIT; + LERROR << "graph " << m_graph << " has not been produced: check pipeline"; + return result; + } + LinguisticGraph* graph=tokenList->getGraph(); + + + QQueue toVisit; + QSet visited; + toVisit.enqueue(sentenceBegin); + LinguisticGraphVertex v = 0; + while (v!=sentenceEnd && !toVisit.empty()) + { + v = toVisit.dequeue(); +#ifdef DEBUG_LP + LDEBUG << "ConllDumperPrivate::collectPredicateTokens vertex:" << v; +#endif + visited.insert(v); + + std::set< AnnotationGraphVertex > vMatches = annotationData->matches("PosGraph", v, "annot"); + for (auto it = vMatches.begin(); it != vMatches.end(); it++) + { + AnnotationGraphVertex vMatch = *it; + if (annotationData->hasStringAnnotation(vMatch,"Predicate")) + { +#ifdef DEBUG_LP + LDEBUG << "ConllDumperPrivate::collectPredicateTokens insert" << v << vMatch; +#endif + result.insert(v, vMatch); + } + } + LinguisticGraphOutEdgeIt outItr,outItrEnd;bool newSentence(const QString & line); + for (boost::tie(outItr,outItrEnd)=boost::out_edges(v,*graph); outItr!=outItrEnd; outItr++) + { + LinguisticGraphVertex next=boost::target(*outItr,*graph); + if (!visited.contains(next) && next != tokenList->lastVertex()) + { + toVisit.enqueue(next); + } + } + } + return result; +} + +} // end namespace +} // end namespace +} // end namespace + + + + + +#else + +/* + + + Copyright 2002-2014 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. If not, see +*/ + +#include "ConllDumper.h" +#include "common/MediaProcessors/DumperStream.h" +#include "common/time/traceUtils.h" +#include "common/tools/FileUtils.h" +#include "common/Data/strwstrtools.h" +#include "common/MediaticData/mediaticData.h" +#include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" +#include "common/AbstractFactoryPattern/SimpleFactory.h" +#include "linguisticProcessing/LinguisticProcessingCommon.h" +#include "linguisticProcessing/common/annotationGraph/AnnotationGraph.h" +#include "linguisticProcessing/common/annotationGraph/AnnotationData.h" +#include "linguisticProcessing/core/LinguisticProcessors/LinguisticMetaData.h" +#include "linguisticProcessing/core/LinguisticResources/LinguisticResources.h" +#include "linguisticProcessing/core/LinguisticAnalysisStructure/LinguisticGraph.h" +#include "linguisticProcessing/core/LinguisticAnalysisStructure/AnalysisGraph.h" +#include "linguisticProcessing/core/TextSegmentation/SegmentationData.h" +#include "linguisticProcessing/core/SyntacticAnalysis/SyntacticData.h" +#include "linguisticProcessing/core/LinguisticAnalysisStructure/MorphoSyntacticData.h" +#include "linguisticProcessing/core/LinguisticAnalysisStructure/MorphoSyntacticDataUtils.h" +#include "linguisticProcessing/core/Automaton/SpecificEntityAnnotation.h" +#include "common/misc/AbstractAccessByString.h" +#include "linguisticProcessing/core/AnalysisDumpers/EasyXmlDumper/ConstituantAndRelationExtractor.h" +#include "linguisticProcessing/core/AnalysisDumpers/EasyXmlDumper/relation.h" +#include "linguisticProcessing/core/SemanticAnalysis/LimaConllTokenIdMapping.h" + +#include +#include +#include + +#include + +using namespace Lima::Common; +using namespace Lima::Common::MediaticData; +using namespace Lima::Common::XMLConfigurationFiles; +using namespace 
Lima::Common::AnnotationGraphs; +using namespace Lima::LinguisticProcessing::SpecificEntities; +using namespace Lima::LinguisticProcessing::SemanticAnalysis; +using namespace Lima::LinguisticProcessing::SyntacticAnalysis; +using namespace Lima::LinguisticProcessing::LinguisticAnalysisStructure; + +namespace Lima +{ + +namespace LinguisticProcessing +{ + +namespace AnalysisDumpers +{ + +SimpleFactory conllDumperFactory(CONLLDUMPER_CLASSID); + +class ConllDumperPrivate +{ + friend class ConllDumper; + ConllDumperPrivate(); + + virtual ~ConllDumperPrivate(); + + /** + * @brief Collect all annotation tokens corresponding to a predicate of the + * sentence starting at @ref sentenceBegin and finishing at @ref sentenceEnd + */ + QMultiMap collectPredicateTokens( + Lima::AnalysisContent& analysis, LinguisticGraphVertex sentenceBegin, LinguisticGraphVertex sentenceEnd); + + MediaId m_language; + std::string m_property; + const Common::PropertyCode::PropertyAccessor* m_propertyAccessor; + const Common::PropertyCode::PropertyManager* m_propertyManager; + const Common::PropertyCode::PropertyManager* m_timeManager; //Ajout + const Common::PropertyCode::PropertyAccessor* m_timeAccessor; //Ajout + + std::string m_graph; + std::string m_sep; + std::string m_sepPOS; + std::string m_verbTenseFlag; //Ajout + QMap m_conllLimaDepMapping; + std::string m_suffix; +}; + + +ConllDumperPrivate::ConllDumperPrivate(): +m_language(0), +m_property("MICRO"), +m_propertyAccessor(0), +m_propertyManager(0), +m_graph("PosGraph"), +m_sep(" "), +m_sepPOS("#"), +m_conllLimaDepMapping(), +m_suffix(".conll") +{ +} + +ConllDumperPrivate::~ConllDumperPrivate() +{} + +ConllDumper::ConllDumper(): +AbstractTextualAnalysisDumper(), +m_d(new ConllDumperPrivate()) +{ +} + +ConllDumper::~ConllDumper() +{ + delete m_d; +} + +void ConllDumper::init(Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, + Manager* manager) +{ + DUMPERLOGINIT; + 
AbstractTextualAnalysisDumper::init(unitConfiguration,manager); + m_d->m_language=manager->getInitializationParameters().media; + try + { + m_d->m_graph=unitConfiguration.getParamsValueAtKey("graph"); + } + catch (NoSuchParam& ) {} // keep default value + const Common::PropertyCode::PropertyCodeManager& codeManager=static_cast(Common::MediaticData::MediaticData::single().mediaData(m_d->m_language)).getPropertyCodeManager(); + m_d->m_propertyAccessor=&codeManager.getPropertyAccessor("MICRO"); + + try + { + m_d->m_verbTenseFlag=unitConfiguration.getParamsValueAtKey("verbTenseFlag"); + } + catch (NoSuchParam& ) { + m_d->m_verbTenseFlag=std::string("False"); + } // keep default value + + try + { + m_d->m_sep=unitConfiguration.getParamsValueAtKey("sep"); + } + catch (NoSuchParam& ) {} // keep default value + + try + { + m_d->m_sepPOS=unitConfiguration.getParamsValueAtKey("sepPOS"); + } + catch (NoSuchParam& ) {} // keep default value + + try + { + m_d->m_property=unitConfiguration.getParamsValueAtKey("property"); + } + catch (NoSuchParam& ) {} // keep default value + try + { + m_d->m_suffix=unitConfiguration.getParamsValueAtKey("outputSuffix"); + } + catch (NoSuchParam& ) {} // keep default value + m_d->m_propertyManager=&codeManager.getPropertyManager(m_d->m_property); + + m_d->m_timeManager=&codeManager.getPropertyManager("TIME"); + m_d->m_timeAccessor=&codeManager.getPropertyAccessor("TIME"); + + try { + std::string resourcePath = Common::MediaticData::MediaticData::single().getResourcesPath(); + std::string mappingFile = resourcePath + "/" + unitConfiguration.getParamsValueAtKey("mappingFile"); + std::ifstream ifs(mappingFile, std::ifstream::binary); + if (!ifs.good()) + { + LERROR << "ERROR: cannot open"+ mappingFile; + throw InvalidConfiguration(); + } + while (ifs.good() && !ifs.eof()) + { + std::string line = Lima::Common::Misc::readLine(ifs); + QStringList strs = QString::fromUtf8(line.c_str()).split('\t'); + if (strs.size() == 2) + { + 
m_d->m_conllLimaDepMapping.insert(strs[0],strs[1]); + } + } + + } catch (Common::XMLConfigurationFiles::NoSuchParam& ) + { + LINFO << "no parameter 'mappingFile' in ConllDumper group" << " !"; +// throw InvalidConfiguration(); + } +} + +LimaStatusCode ConllDumper::process(AnalysisContent& analysis) const +{ +#ifdef DEBUG_LP + DUMPERLOGINIT; + LDEBUG << "ConllDumper::process"; +#endif + + LinguisticMetaData* metadata = static_cast(analysis.getData("LinguisticMetaData")); + if (metadata == 0) + { + DUMPERLOGINIT; + LERROR << "ConllDumper::process no LinguisticMetaData ! abort"; + return MISSING_DATA; + } + AnnotationData* annotationData = static_cast(analysis.getData("AnnotationData")); + if (annotationData == 0) + { + DUMPERLOGINIT; + LERROR << "ConllDumper::process no AnnotationData ! abort"; + return MISSING_DATA; + } + AnalysisGraph* tokenList=static_cast(analysis.getData(m_d->m_graph));//est de type PosGraph et non pas AnalysisGraph + if (tokenList==0) + { + DUMPERLOGINIT; + LERROR << "ConllDumper::process graph " << m_d->m_graph << " has not been produced: check pipeline"; + return MISSING_DATA; + } + LinguisticGraph* graph=tokenList->getGraph(); + SegmentationData* sd=static_cast(analysis.getData("SentenceBoundaries")); + if (sd==0) + { + DUMPERLOGINIT; + LERROR << "ConllDumper::process no SentenceBoundaries! 
abort"; + return MISSING_DATA; + } + + SyntacticData* syntacticData=static_cast(analysis.getData("SyntacticData")); + if (syntacticData==0) + { + syntacticData=new SyntacticData(tokenList,0); + syntacticData->setupDependencyGraph(); + analysis.setData("SyntacticData",syntacticData); + } + const DependencyGraph* depGraph = syntacticData-> dependencyGraph(); + + QScopedPointer dstream(initialize(analysis)); + + std::map< LinguisticGraphVertex, std::pair > vertexDependencyInformations; + + uint64_t nbSentences((sd->getSegments()).size()); + if (nbSentences == 0) + { + DUMPERLOGINIT; + LERROR << "ConllDumper::process 0 sentence to process"; + return SUCCESS_ID; + } + + std::vector::iterator sbItr=(sd->getSegments().begin()); +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process There are "<< nbSentences << " sentences"; +#endif + LinguisticGraphVertex sentenceBegin = sbItr->getFirstVertex(); + LinguisticGraphVertex sentenceEnd = sbItr->getLastVertex(); + + + const FsaStringsPool& sp=Common::MediaticData::MediaticData::single().stringsPool(m_d->m_language); +// for (auto im=m_d->m_conllLimaDepMapping.begin();im!=m_d->m_conllLimaDepMapping.end();im++) +// { +// LDEBUG << "("<< (*im).first<< "," << (*im).second << ")" << endl; +// } + + LimaConllTokenIdMapping* limaConllTokenIdMapping = static_cast(analysis.getData("LimaConllTokenIdMapping")); + if (limaConllTokenIdMapping == 0) + { + limaConllTokenIdMapping = new LimaConllTokenIdMapping(); + analysis.setData("LimaConllTokenIdMapping", limaConllTokenIdMapping); + } + int sentenceNb=0; + + while (sbItr != sd->getSegments().end() ) //for each sentence + { + sentenceNb++; + sentenceBegin=sbItr->getFirstVertex(); + sentenceEnd=sbItr->getLastVertex(); + std::mapsegmentationMapping;//mapping the two types of segmentations (Lima and conll) + std::mapsegmentationMappingReverse; + +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process begin - end: " << sentenceBegin << " - " << sentenceEnd; +#endif + //LinguisticGraphOutEdgeIt 
outItr,outItrEnd; + QQueue toVisit; + QSet visited; + toVisit.enqueue(sentenceBegin); + int tokenId = 0; + LinguisticGraphVertex v = 0; + while (v != sentenceEnd && !toVisit.empty()) + + { + v = toVisit.dequeue(); +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process Vertex index : " << v; +#endif + visited.insert(v); + segmentationMapping.insert(std::make_pair(v,tokenId)); + segmentationMappingReverse.insert(std::make_pair(tokenId,v)); +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process conll id : " << tokenId << " Lima id : " << v; +#endif + DependencyGraphVertex dcurrent = syntacticData->depVertexForTokenVertex(v); + DependencyGraphOutEdgeIt dit, dit_end; + boost::tie(dit,dit_end) = boost::out_edges(dcurrent,*depGraph); + for (; dit != dit_end; dit++) + { +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process Dumping dependency edge " << (*dit).m_source << " -> " << (*dit).m_target; +#endif + try + { + CEdgeDepRelTypePropertyMap typeMap = get(edge_deprel_type, *depGraph); + SyntacticRelationId type = typeMap[*dit]; + std::string syntRelName=static_cast(Common::MediaticData::MediaticData::single().mediaData(m_d->m_language)).getSyntacticRelationName(type); +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process relation = " << syntRelName; + LDEBUG << "ConllDumper::process Src : Dep vertex= " << boost::source(*dit, *depGraph); + LinguisticGraphVertex src = syntacticData->tokenVertexForDepVertex(boost::source(*dit, *depGraph)); + LDEBUG << "ConllDumper::process Src : Morph vertex= " << src; + LDEBUG << "ConllDumper::process Targ : Dep vertex= " << boost::target(*dit, *depGraph); +#endif + LinguisticGraphVertex dest = syntacticData->tokenVertexForDepVertex(boost::target(*dit, *depGraph)); +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process Targ : Morph vertex= " << dest; +#endif + if (syntRelName!="") + { +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process saving target for" << v << ":" << dest << syntRelName; +#endif + vertexDependencyInformations.insert(std::make_pair(v, 
std::make_pair(dest,syntRelName))); + } + } + catch (const std::range_error& ) + { + } + catch (...) + { +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process: catch others....."; +#endif + throw; + } + } + if (v == sentenceEnd) + { + continue; + } + LinguisticGraphOutEdgeIt outItr,outItrEnd; + for (boost::tie(outItr,outItrEnd)=boost::out_edges(v,*graph); outItr!=outItrEnd; outItr++) + { + LinguisticGraphVertex next=boost::target(*outItr,*graph); + if (!visited.contains(next) && next != tokenList->lastVertex()) + { + toVisit.enqueue(next); + } + } + ++tokenId; + } + + // instead of looking to all vertices, follow the graph (in + // morphological graph, some vertices are not related to main graph: + // idiomatic expressions parts and named entity parts) + + toVisit.clear(); + visited.clear(); + + sentenceBegin=sbItr->getFirstVertex(); + sentenceEnd=sbItr->getLastVertex(); + + // get the list of predicates for the current sentence + QMultiMap predicates = m_d->collectPredicateTokens( analysis, sentenceBegin, sentenceEnd ); +#ifdef DEBUG_LP + //LDEBUG << "ConllDumper::process predicates for sentence between" << sentenceBegin << "and" << sentenceEnd << "are:" << predicates; +#endif + QList< LinguisticGraphVertex > keys = predicates.keys(); + + toVisit.enqueue(sentenceBegin); + tokenId=0; + v=0; + while (!toVisit.empty() && v!=sentenceEnd) + { //as long as there are vertices in the sentence + v = toVisit.dequeue(); + + Token* ft=get(vertex_token,*graph,v); + MorphoSyntacticData* morphoData=get(vertex_data,*graph, v); +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process PosGraph token" << v; +#endif + if( morphoData!=0 && !morphoData->empty() && ft != 0) + { + const QString macro=QString::fromUtf8(static_cast(Common::MediaticData::MediaticData::single().mediaData(m_d->m_language)).getPropertyCodeManager().getPropertyManager("MACRO").getPropertySymbolicValue(morphoData->firstValue(*m_d->m_propertyAccessor)).c_str()); + const QString 
micro=QString::fromUtf8(static_cast(Common::MediaticData::MediaticData::single().mediaData(m_d->m_language)).getPropertyCodeManager().getPropertyManager("MICRO").getPropertySymbolicValue(morphoData->firstValue(*m_d->m_propertyAccessor)).c_str()); +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process graphTag:" << micro; +#endif + + std::string inflectedToken=ft->stringForm().toUtf8().constData(); + std::string lemmatizedToken; + if (morphoData != 0 && !morphoData->empty()) + { + lemmatizedToken=sp[(*morphoData)[0].lemma].toUtf8().constData(); + } + + QString neType = QString::fromUtf8("_") ; + std::set< AnnotationGraphVertex > anaVertices = annotationData->matches("PosGraph",v,"AnalysisGraph"); + // note: anaVertices size should be 0 or 1 + for (std::set< AnnotationGraphVertex >::const_iterator anaVerticesIt = anaVertices.begin(); + anaVerticesIt != anaVertices.end(); anaVerticesIt++) + { + std::set< AnnotationGraphVertex > matches = annotationData->matches("AnalysisGraph",*anaVerticesIt,"annot"); + for (std::set< AnnotationGraphVertex >::const_iterator it = matches.begin(); + it != matches.end(); it++) + { + AnnotationGraphVertex vx=*it; + if (annotationData->hasAnnotation(vx, Common::Misc::utf8stdstring2limastring("SpecificEntity"))) + { + const SpecificEntityAnnotation* se = + annotationData->annotation(vx, Common::Misc::utf8stdstring2limastring("SpecificEntity")). 
+ pointerValue(); + neType = Common::MediaticData::MediaticData::single().getEntityName(se->getType()); + break; + } + } + if (neType != "_") break; + } + QString conllRelName = "_"; + int targetConllId = 0; + if (vertexDependencyInformations.count(v)!=0) + { + LinguisticGraphVertex target=vertexDependencyInformations.find(v)->second.first; +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process target saved for" << v << "is" << target; +#endif + if (segmentationMapping.find(target) != segmentationMapping.end()) + { + targetConllId=segmentationMapping.find(target)->second; + } + else + { + DUMPERLOGINIT; + LERROR << "ConllDumper::process target" << target << "not found in segmantation mapping"; + } +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process conll target saved for " << tokenId << " is " << targetConllId; +#endif + QString relName = QString::fromUtf8(vertexDependencyInformations.find(v)->second.second.c_str()); +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process the lima dependency tag for " + << ft->stringForm()<< " is " << relName; +#endif + if (m_d->m_conllLimaDepMapping.contains(relName)) + { + conllRelName=m_d->m_conllLimaDepMapping[relName]; + } + else + { + conllRelName= relName; +// LERROR << "ConllDumper::process" << relName << "not found in mapping"; + } + } + // Modified CONLL-X format with an extra named entity type column + // http://ilk.uvt.nl/conll/#dataformat + // 1 ID Token counter, starting at 1 for each new sentence. + // 2 FORM Word form or punctuation symbol. + // 3 LEMMA Lemma or stem (depending on particular data set) of word form, or an underscore if not available. + // 4 CPOSTAG Coarse-grained part-of-speech tag, where tagset depends on the language. + // 5 POSTAG Fine-grained part-of-speech tag, where the tagset depends on the language, or identical to the coarse-grained part-of-speech tag if not available. 
+ // 6 NER Extra column: Named entity type + // 7 FEATS Unordered set of syntactic and/or morphological features (depending on the particular language), separated by a vertical bar (|), or an underscore if not available. + // 8 HEAD Head of the current token, which is either a value of ID or zero ('0'). Note that depending on the original treebank annotation, there may be multiple tokens with an ID of zero. + // 9 DEPREL Dependency relation to the HEAD. The set of dependency relations depends on the particular language. Note that depending on the original treebank annotation, the dependency relation may be meaningfull or simply 'ROOT'. + // 10 PHEAD Projective head of current token, which is either a value of ID or zero ('0'), or an underscore if not available. Note that depending on the original treebank annotation, there may be multiple tokens an with ID of zero. The dependency structure resulting from the PHEAD column is guaranteed to be projective (but is not available for all languages), whereas the structures resulting from the HEAD column will be non-projective for some sentences of some languages (but is always available). + // 11 PDEPREL Dependency relation to the PHEAD, or an underscore if not available. The set of dependency relations depends on the particular language. Note that depending on the original treebank annotation, the dependency relation may be meaningfull or simply 'ROOT'. + + QString targetConllIdString = targetConllId > 0 ? 
QString("%1").arg(targetConllId) : "_"; + dstream->out() << tokenId + << "\t" << inflectedToken + << "\t" << lemmatizedToken + << "\t" << macro.toUtf8().constData() + << "\t" << micro.toUtf8().constData() + << "\t" << neType.toUtf8().constData() + << "\t" << "_" + << "\t" << targetConllIdString.toUtf8().constData() + << "\t" << conllRelName.toUtf8().constData() + << "\t" << "_" + << "\t" << "_"; + if (!predicates.isEmpty()) + { + dstream->out() << "\t"; +// LDEBUG << "ConllDumper::process output the predicate if any"; + if (!predicates.contains(v)) + { + // No predicate for this token + dstream->out() << "_"; + } + else + { + // This token is a predicate, output it + QString predicateAnnotation = annotationData->stringAnnotation(predicates.value(v),"Predicate"); + dstream->out() << predicateAnnotation; + } + + // Now output the roles supported by the current PoS graph token +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process output the roles for the" << keys.size() << "predicates"; +#endif + for (int i = 0; i < keys.size(); i++) + { + // There will be one column for each predicate. Output the + // separator right now + dstream->out() << "\t"; + AnnotationGraphVertex predicateVertex = predicates.value(keys[keys.size()-1-i]); + + std::set< AnnotationGraphVertex > vMatches = annotationData->matches("PosGraph", v, "annot"); + if (vMatches.empty()) + { +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process no node matching PoS graph vertex" << v << "in the annotation graph. 
Output '_'."; +#endif + dstream->out() << "_"; + } + else + { +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process there is"<getGraph()); + for (; vMatchInEdgesIt != vMatchInEdgesIt_end; vMatchInEdgesIt++) + { + AnnotationGraphVertex inVertex = boost::source(*vMatchInEdgesIt, annotationData->getGraph()); + std::set< LinguisticGraphVertex > inVertexAnnotPosGraphMatches = annotationData->matches("annot",inVertex,"PosGraph"); + if (inVertex == predicateVertex && !inVertexAnnotPosGraphMatches.empty()) + { + // Current edge is holding a role of the current predicate + roleAnnotation = annotationData->stringAnnotation(*vMatchInEdgesIt,"SemanticRole"); + break; + } + else + { + // Current edge does not hold a role of the current predicate +// dstream->out() << "_"; + } + } + if (roleAnnotation != "_") break; + } + dstream->out() << roleAnnotation.toUtf8().constData(); + } + } + } + dstream->out() << std::endl; + } + + if (v == sentenceEnd) + { + continue; + } +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process look at out edges of" << v; +#endif + LinguisticGraphOutEdgeIt outIter,outIterEnd; + for (boost::tie(outIter,outIterEnd) = boost::out_edges(v,*graph); outIter!=outIterEnd; outIter++) + { + LinguisticGraphVertex next = boost::target(*outIter,*graph); +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process looking out vertex" << next; +#endif + if (!visited.contains(next)) + { +#ifdef DEBUG_LP + LDEBUG << "ConllDumper::process enqueuing" << next; +#endif + visited.insert(next); + toVisit.enqueue(next); + } + } + tokenId++; + } + dstream->out() << std::endl; + limaConllTokenIdMapping->insert(std::make_pair(sentenceNb, segmentationMappingReverse)); + sbItr++; + } + + return SUCCESS_ID; + +} + +QMultiMap ConllDumperPrivate::collectPredicateTokens( + Lima::AnalysisContent& analysis, LinguisticGraphVertex sentenceBegin, LinguisticGraphVertex sentenceEnd) +{ +#ifdef DEBUG_LP + DUMPERLOGINIT; +#endif + QMap result; + + AnnotationData* annotationData = 
static_cast(analysis.getData("AnnotationData")); + + AnalysisGraph* tokenList=static_cast(analysis.getData(m_graph)); + if (tokenList==0) { + DUMPERLOGINIT; + LERROR << "graph " << m_graph << " has not been produced: check pipeline"; + return result; + } + LinguisticGraph* graph=tokenList->getGraph(); + + + QQueue toVisit; + QSet visited; + toVisit.enqueue(sentenceBegin); + LinguisticGraphVertex v = 0; + while (v!=sentenceEnd && !toVisit.empty()) + { + v = toVisit.dequeue(); +#ifdef DEBUG_LP + LDEBUG << "ConllDumperPrivate::collectPredicateTokens vertex:" << v; +#endif + visited.insert(v); + + std::set< AnnotationGraphVertex > vMatches = annotationData->matches("PosGraph", v, "annot"); + for (auto it = vMatches.begin(); it != vMatches.end(); it++) + { + AnnotationGraphVertex vMatch = *it; + if (annotationData->hasStringAnnotation(vMatch,"Predicate")) + { +#ifdef DEBUG_LP + LDEBUG << "ConllDumperPrivate::collectPredicateTokens insert" << v << vMatch; +#endif + result.insert(v, vMatch); + } + } + LinguisticGraphOutEdgeIt outItr,outItrEnd;bool newSentence(const QString & line); + for (boost::tie(outItr,outItrEnd)=boost::out_edges(v,*graph); outItr!=outItrEnd; outItr++) + { + LinguisticGraphVertex next=boost::target(*outItr,*graph); + if (!visited.contains(next) && next != tokenList->lastVertex()) + { + toVisit.enqueue(next); + } + } + } + return result; +} + +} // end namespace +} // end namespace +} // end namespace + + +#endif \ No newline at end of file diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/ConllDumper.h b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/ConllDumper.h similarity index 92% rename from lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/ConllDumper.h rename to lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/ConllDumper.h index 86ec375dd..6ca27be3b 100644 --- 
a/lima_linguisticprocessing/src/linguisticProcessing/core/AnalysisDumpers/ConllDumper.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/ConllDumper.h @@ -19,7 +19,7 @@ #ifndef LIMA_LINGUISTICPROCESSING_ANALYSISDUMPERSTEXTDUMPER_H #define LIMA_LINGUISTICPROCESSING_ANALYSISDUMPERSTEXTDUMPER_H -#include "AnalysisDumpersExport.h" +#include "SemanticAnalysisExport.h" #include "linguisticProcessing/core/LinguisticProcessors/AbstractTextualAnalysisDumper.h" namespace Lima @@ -36,7 +36,7 @@ class ConllDumperPrivate; /** @author Gael de Chalendar */ -class LIMA_ANALYSISDUMPERS_EXPORT ConllDumper : public AbstractTextualAnalysisDumper +class LIMA_SEMANTICANALYSIS_EXPORT ConllDumper : public AbstractTextualAnalysisDumper { public: ConllDumper(); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/ConstraintFunction.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/ConstraintFunction.cpp index 159c8c4e6..c969eaaf4 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/ConstraintFunction.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/ConstraintFunction.cpp @@ -70,7 +70,9 @@ ConstraintFunction(language,complement) bool ClearSemanticRelation::operator()(AnalysisContent& analysis ) const { SEMLOGINIT; +#ifdef DEBUG_LP LDEBUG << "ClearSemanticRelation::operator()"; +#endif SemanticRelationData * semanticData=static_cast(analysis.getData("SemanticRelationData")); if (semanticData==0) { @@ -91,7 +93,9 @@ SaveSemanticRelation::SaveSemanticRelation(MediaId language, bool SaveSemanticRelation::operator()(AnalysisContent& analysis ) const { SEMLOGINIT; +#ifdef DEBUG_LP LDEBUG << "SaveSemanticRelation::operator()"; +#endif SemanticRelationData * semanticData=static_cast(analysis.getData("SemanticRelationData")); if (semanticData==0) { @@ -117,7 +121,9 @@ operator()(const LinguisticAnalysisStructure::AnalysisGraph& anagraph, 
AnalysisContent& analysis ) const { SEMLOGINIT; +#ifdef DEBUG_LP LDEBUG << "CreateSemanticRelation::operator()" << vertex1 << vertex2 << m_semanticRelationType; +#endif LIMA_UNUSED(anagraph); SemanticRelationData * semanticData=static_cast(analysis.getData("SemanticRelationData")); if (semanticData==0) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp new file mode 100644 index 000000000..31b3eccd0 --- /dev/null +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.cpp @@ -0,0 +1,925 @@ + +#ifdef ANTINNO_SPECIFIC + + + + +// antinno travaille avec la version 2.1-patches tant que la version master n'est pas synchronisée + + +/* + Copyright 2016 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. 
If not, see +*/ + +// Ici sinon compile pas +#include + +#include "KnowledgeBasedSemanticRoleLabeler.h" + +#include "common/Data/LimaString.h" +#include "common/misc/Exceptions.h" +#include "common/Data/strwstrtools.h" +#include "common/AbstractFactoryPattern/SimpleFactory.h" +#include "linguisticProcessing/core/SemanticAnalysis/ConllDumper.h" +#include "linguisticProcessing/core/LinguisticResources/LinguisticResources.h" +#include "linguisticProcessing/core/LinguisticProcessors/LimaStringText.h" +#include "linguisticProcessing/core/LinguisticProcessors/LinguisticMetaData.h" +#include "linguisticProcessing/core/LinguisticAnalysisStructure/AnalysisGraph.h" +#include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" +//#include "common/tools/FileUtils.h" +#include "common/MediaticData/mediaticData.h" +#include "common/time/timeUtilsController.h" + +#include + + +#include + + + +using namespace std; +using namespace Lima::LinguisticProcessing::AnalysisDumpers; +using namespace Lima::LinguisticProcessing::LinguisticAnalysisStructure; +using namespace Lima::Common::XMLConfigurationFiles; +using namespace Lima::Common::Misc; + + +#define HANDLE_ERROR(Y,Z) if ( Y ) Z ; +#define HANDLE_ERROR_EQUAL(X,Y,Z) if ( X == Y ) Z ; +#define HANDLE_ERROR_RETURN(X,Y,Z) if ( X ) { Y ; return Z; } +#define HANDLE_ERROR_EQUAL_RETURN(X,Y,Z,R) if ( X == Y ) { Z ; return R ; } +#define HANDLE_ERROR_DIFFERENT(X,Y,Z) if ( X != Y ) Z ; +#define HANDLE_ERROR_DIFFERENT_RETURN(X,Y,Z,R) if ( X != Y ) { Z ; return R ; } + + + +namespace Lima +{ +namespace LinguisticProcessing +{ +namespace SemanticAnalysis +{ + +static SimpleFactory knowledgeBasedSemanticRoleLabelerFactory(KNOWLEDGEBASEDSEMANTICROLELABELER_CLASSID); + + +class KnowledgeBasedSemanticRoleLabelerPrivate +{ +public: + KnowledgeBasedSemanticRoleLabelerPrivate(); + virtual ~KnowledgeBasedSemanticRoleLabelerPrivate(); + + PyObject* m_instance; + const MediaProcessUnit* m_dumper; + const MediaProcessUnit* m_loader; + QString 
m_inputSuffix; + QString m_outputSuffix; + QString m_temporaryFileMetadata; +}; + +KnowledgeBasedSemanticRoleLabelerPrivate::KnowledgeBasedSemanticRoleLabelerPrivate() : + m_instance(0), + m_dumper(new ConllDumper()) +{} + +KnowledgeBasedSemanticRoleLabelerPrivate::~KnowledgeBasedSemanticRoleLabelerPrivate() +{ +} + +KnowledgeBasedSemanticRoleLabeler::KnowledgeBasedSemanticRoleLabeler() : m_d(new KnowledgeBasedSemanticRoleLabelerPrivate()) +{} + + +KnowledgeBasedSemanticRoleLabeler::~KnowledgeBasedSemanticRoleLabeler() +{ + delete m_d; +} + +auto failed_to_import_the_sys_module = []() +{ + SEMANTICANALYSISLOGINIT; + LERROR << "Failed to import the sys module"; + PyErr_Print(); +}; + +auto cannot_instantiate_the_semanticrolelabeler_python_class = []() +{ + SEMANTICANALYSISLOGINIT; + LERROR << "Cannot instantiate the SemanticRoleLabeler python class"; + PyErr_Print(); + Py_Exit(1); +}; + +void KnowledgeBasedSemanticRoleLabeler::init( + Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, + Manager* manager) + +{ +#ifdef DEBUG_LP + SEMANTICANALYSISLOGINIT; + LDEBUG << "KnowledgeBasedSemanticRoleLabeler::init"; +#endif + + MediaId language=manager->getInitializationParameters().media; + try { + string dumperName=unitConfiguration.getParamsValueAtKey("dumper"); + // create the dumper + m_d->m_dumper=manager->getObject(dumperName); + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + SEMANTICANALYSISLOGINIT; + LERROR << "Missing 'dumper' parameter in KnowledgeBasedSemanticRoleLabeler group for language " + << (int)language << " !"; + throw InvalidConfiguration(); + } + + try { + string loaderName=unitConfiguration.getParamsValueAtKey("loader"); + // create the loader + m_d->m_loader=manager->getObject(loaderName); + } + catch (InvalidConfiguration& ) { + m_d->m_loader = 0; + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + SEMANTICANALYSISLOGINIT; + LERROR << "Missing 'loader' parameter in KnowledgeBasedSemanticRoleLabeler 
group for language " + << (int)language << " !"; + throw InvalidConfiguration(); + } + + try { + m_d->m_temporaryFileMetadata = QString::fromUtf8(unitConfiguration.getParamsValueAtKey("temporaryFileMetadata").c_str()); + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + // optional parameter: keep default value (empty) + } + + if (m_d->m_temporaryFileMetadata.isEmpty()) + { + try { + m_d->m_inputSuffix=QString::fromUtf8(unitConfiguration.getParamsValueAtKey("inputSuffix").c_str()); + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + SEMANTICANALYSISLOGINIT; + LERROR << "Missing 'inputSuffix' parameter in KnowledgeBasedSemanticRoleLabeler group for language " + << (int)language << " !"; + throw InvalidConfiguration(); + } + + try { + m_d->m_outputSuffix=QString::fromUtf8(unitConfiguration.getParamsValueAtKey("outputSuffix").c_str()); + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + SEMANTICANALYSISLOGINIT; + LERROR << "Missing 'outputSuffix' parameter in KnowledgeBasedSemanticRoleLabeler group for language " + << (int)language << " !"; + throw InvalidConfiguration(); + } + } + QString path; + QString mode = "VerbNet"; + QString kbsrlLogLevel = "error"; + + try + { + kbsrlLogLevel = QString::fromUtf8(unitConfiguration.getParamsValueAtKey("loglevel").c_str()); + } + catch (NoSuchParam& ) + { + // keep default + } + + try + { + path = QString::fromUtf8(unitConfiguration.getParamsValueAtKey("path").c_str()); + } + catch (NoSuchParam& ) + { + SEMANTICANALYSISLOGINIT; + LERROR << "no param 'path' in KnowledgeBasedSemanticRoleLabeler group configuration"; + throw InvalidConfiguration(); + } + + try + { + mode = QString::fromUtf8(unitConfiguration.getParamsValueAtKey("mode").c_str()); + if (mode != "VerbNet" && mode != "FrameNet") + { + SEMANTICANALYSISLOGINIT; + LERROR << "Unknown semantic annotation mode" << mode; + throw InvalidConfiguration(); + } + } + catch (NoSuchParam& ) + { + // keep default + } + + // Initialize the python SRL 
system + /* + * Find the first python executable in the path and use it as the program name. + * + * This allows to find the modules set up in an activated virtualenv + */ + QString str_program_name; + QString pathEnv = QString::fromUtf8(qgetenv("PATH").constData()); +#ifdef ANTINNO_ASFALDA + str_program_name = "c://python24//python.exe"; + wchar_t* pythonPath = L"c://python24//python.exe"; + //Py_SetProgramName(const_cast( str_program_name.toStdWString().c_str())); + // Erreur de link que je pige pas, je triche... + Py_SetProgramName(pythonPath); +#else + for (const auto & path: pathEnv.split(QRegExp("[;:]"))) + { + if (QFile::exists(path + "/python" )) + { + str_program_name = path + "/python"; + break; + } + } +#ifndef WIN32 + Py_SetProgramName(const_cast( str_program_name.toStdWString().c_str())); +#else + Py_SetProgramName( (wchar_t*)str_program_name.unicode() ); +#endif +#endif + + + Py_Initialize(); + + PyObject* main_module = PyImport_ImportModule("__main__"); + PyObject* main_dict = PyModule_GetDict(main_module); + PyObject* sys_module = PyImport_ImportModule("sys"); + HANDLE_ERROR_EQUAL (sys_module, NULL, failed_to_import_the_sys_module() ); + + PyDict_SetItemString(main_dict, "sys", sys_module); + + // Add the path to the knowledgesrl pachkage to putho path + PyObject* pythonpath = PySys_GetObject("path"); + if (PyList_Append(pythonpath, PyUnicode_DecodeFSDefault("D:/telechargement Amose/knowledgesrl/src")) == -1) + { + SEMANTICANALYSISLOGINIT; + LERROR << "Failed to append to python path"; + PyErr_Print(); + Py_Exit(1); + } + + // Import the semanticrolelabeler module + PyObject* semanticrolelabeler_module = PyImport_ImportModule("semanticrolelabeler"); + if (semanticrolelabeler_module == NULL) + { + SEMANTICANALYSISLOGINIT; + LERROR << "Failed to import srl semanticrolelabeler module"; + PyErr_Print(); + Py_Exit(1); + } + + // Create the semantic role labeller instance + m_d->m_instance = PyObject_CallMethod(semanticrolelabeler_module, 
"SemanticRoleLabeler", "[sss]", + QString("--log=%1").arg(kbsrlLogLevel).toUtf8().constData(), + QString("--frame-lexicon=%1").arg(mode).toUtf8().constData(), + QString("--language=%1").arg(Lima::Common::MediaticData::MediaticData::single().getMediaId(language).c_str()).toUtf8().constData()); + HANDLE_ERROR_EQUAL(m_d->m_instance,NULL,cannot_instantiate_the_semanticrolelabeler_python_class()) +} + +auto metadata_equal_zero = []() +{ + SEMANTICANALYSISLOGINIT; + LERROR << "no LinguisticMetaData ! abort"; +}; + +auto temporary_file_not_open = []() +{ + SEMANTICANALYSISLOGINIT; + LERROR << "KnowledgeBasedSemanticRoleLabeler: unable to create temporary file"; +}; + +auto temporary_file_srl_not_open = [](QScopedPointer& temporaryFile) +{ + SEMANTICANALYSISLOGINIT; + LERROR << "KnowledgeBasedSemanticRoleLabeler: unable to open temporary file for dumping SRL CoNLL data to it"<< temporaryFile->fileName(); + LERROR << "KnowledgeBasedSemanticRoleLabeler: keep (do not auto remove) it for debug purpose." ; + temporaryFile->setAutoRemove(false); +}; + +auto failed_to_load_data_from_temporary_file = [](QScopedPointer& temporaryFile) +{ + SEMANTICANALYSISLOGINIT; + LERROR << "KnowledgeBasedSemanticRoleLabeler: failed to load data from temporary file" << temporaryFile->fileName(); + LERROR << "KnowledgeBasedSemanticRoleLabeler: keep (do not auto remove) it for debug purpose." 
<< temporaryFile->fileName(); + temporaryFile->setAutoRemove(false); +}; + +auto failure_during_call_of_the_annotate_method_on = [](QString& conllInput) +{ + SEMANTICANALYSISLOGINIT; + LERROR << "Failure during call of the annotate method on" << conllInput; + PyErr_Print(); + Py_Exit(1); +}; + +LimaStatusCode KnowledgeBasedSemanticRoleLabeler::process( + AnalysisContent& analysis) const +{ + TimeUtilsController knowledgeBasedSemanticRoleLabelerProcessTime("KnowledgeBasedSemanticRoleLabeler"); + SEMANTICANALYSISLOGINIT; + LINFO << "start SRL process"; + + LinguisticMetaData* metadata=static_cast(analysis.getData("LinguisticMetaData")); + HANDLE_ERROR_EQUAL_RETURN(metadata,0,metadata_equal_zero(),MISSING_DATA) + + QScopedPointer temporaryFile; + if (!m_d->m_temporaryFileMetadata.isEmpty()) + { + QScopedPointer otherTemp(new QTemporaryFile()); + temporaryFile.swap(otherTemp); + HANDLE_ERROR_RETURN(!temporaryFile->open(),temporary_file_not_open(),CANNOT_OPEN_FILE_ERROR); + metadata->setMetaData(m_d->m_temporaryFileMetadata.toUtf8().constData(), + temporaryFile->fileName().toUtf8().constData()); + } + + // Use CoNLL duper to produce the input to the SRL + LimaStatusCode returnCode(SUCCESS_ID); + returnCode=m_d->m_dumper->process(analysis); + if (returnCode!=SUCCESS_ID) { + LERROR << "KnowledgeBasedSemanticRoleLabeler: failed to dump data to temporary file"; + return returnCode; + } + + QString conllInput; + + if (m_d->m_temporaryFileMetadata.isEmpty()) + { + QString fileName = QString::fromUtf8(metadata->getMetaData("FileName").c_str()); + QString inputFilename; + if (!m_d->m_inputSuffix.isEmpty()) + { + inputFilename = fileName+ m_d->m_inputSuffix; + } + QFile inputFile(inputFilename); + inputFile.open(QIODevice::ReadOnly); + conllInput = QString::fromUtf8(inputFile.readAll().constData()); + inputFile.close(); + } + else + { + if (!temporaryFile->open()) + { + SEMANTICANALYSISLOGINIT; + LERROR << "KnowledgeBasedSemanticRoleLabeler: unable to open temporary file after 
dumping CoNLL data to it"<< temporaryFile->fileName(); + LERROR << "KnowledgeBasedSemanticRoleLabeler: keep (do not auto remove) it for debug purpose." ; + temporaryFile->setAutoRemove(false); + return CANNOT_OPEN_FILE_ERROR; + } + conllInput = QString::fromUtf8(temporaryFile->readAll().constData()); +#ifdef DEBUG_LP + temporaryFile->setAutoRemove(false); + SEMANTICANALYSISLOGINIT; + LDEBUG << "KnowledgeBasedSemanticRoleLabeler: keeping temporary file after dumping CoNLL data to it for debugging"<< temporaryFile->fileName(); +#endif + temporaryFile->close(); + } + + // Run the semantic role labeller + PyObject* callResult = PyObject_CallMethod(m_d->m_instance, "annotate", "s", conllInput.toUtf8().constData()); + HANDLE_ERROR_EQUAL(callResult, NULL, failure_during_call_of_the_annotate_method_on(conllInput)); + + // Display the SRL result + char* result = PyUnicode_AsUTF8(callResult); + if (result == NULL) + { + LERROR << "Cannot convert result item to string"; + PyErr_Print(); + Py_Exit(1); + } + LDEBUG << "Python result is:" << result; + if (m_d->m_temporaryFileMetadata.isEmpty()) + { + QString outputFilename; + if (!m_d->m_outputSuffix.isEmpty()) + { + QString fileName = QString::fromUtf8(metadata->getMetaData("FileName").c_str()); + outputFilename = fileName + m_d->m_outputSuffix; + } + QFile outputFile(outputFilename); + outputFile.open(QIODevice::WriteOnly); + outputFile.write(result); + outputFile.close(); + } + else + { + HANDLE_ERROR_RETURN( !temporaryFile->open(), + temporary_file_srl_not_open(temporaryFile), CANNOT_OPEN_FILE_ERROR); + if (!temporaryFile->seek(0)) + { + SEMANTICANALYSISLOGINIT; + LERROR << "KnowledgeBasedSemanticRoleLabeler: unable to seek to the beginning of temporary file"<< temporaryFile->fileName(); + LERROR << "KnowledgeBasedSemanticRoleLabeler: keep (do not auto remove) it for debug purpose." 
; + temporaryFile->setAutoRemove(false); + return UNKNOWN_ERROR; + } + if (temporaryFile->write(result) == -1) + { + SEMANTICANALYSISLOGINIT; + LERROR << "KnowledgeBasedSemanticRoleLabeler: unable to write SRL result to temporary file"<< temporaryFile->fileName(); + LERROR << "KnowledgeBasedSemanticRoleLabeler: keep (do not auto remove) it for debug purpose." ; + temporaryFile->setAutoRemove(false); + return UNKNOWN_ERROR; + } + temporaryFile->close(); + } + Py_DECREF(callResult); + // Import the CoNLL result + returnCode=m_d->m_loader->process(analysis); + HANDLE_ERROR_DIFFERENT_RETURN(returnCode,SUCCESS_ID,failed_to_load_data_from_temporary_file(temporaryFile),returnCode) + + + return returnCode; +} + +} //namespace SemanticAnalysis +} // namespace LinguisticProcessing +} // namespace Lima + + + + + + + +#else + + + + + + + +// version master + +/* + Copyright 2016 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. 
If not, see +*/ + +#include "KnowledgeBasedSemanticRoleLabeler.h" + +#include "common/Data/LimaString.h" +#include "common/misc/Exceptions.h" +#include "common/Data/strwstrtools.h" +#include "common/AbstractFactoryPattern/SimpleFactory.h" +#include "linguisticProcessing/core/SemanticAnalysis/ConllDumper.h" +#include "linguisticProcessing/core/LinguisticResources/LinguisticResources.h" +#include "linguisticProcessing/core/LinguisticProcessors/LimaStringText.h" +#include "linguisticProcessing/core/LinguisticProcessors/LinguisticMetaData.h" +#include "linguisticProcessing/core/LinguisticAnalysisStructure/AnalysisGraph.h" +#include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" +#include "common/tools/FileUtils.h" +#include "common/MediaticData/mediaticData.h" +#include "common/time/timeUtilsController.h" + +#include +#include +#include + +using namespace std; +using namespace Lima::LinguisticProcessing::AnalysisDumpers; +using namespace Lima::LinguisticProcessing::LinguisticAnalysisStructure; +using namespace Lima::Common::XMLConfigurationFiles; +using namespace Lima::Common::Misc; + + +#define HANDLE_ERROR(Y,Z) if ( Y ) Z ; +#define HANDLE_ERROR_EQUAL(X,Y,Z) if ( X == Y ) Z ; +#define HANDLE_ERROR_RETURN(X,Y,Z) if ( X ) { Y ; return Z; } +#define HANDLE_ERROR_EQUAL_RETURN(X,Y,Z,R) if ( X == Y ) { Z ; return R ; } +#define HANDLE_ERROR_DIFFERENT(X,Y,Z) if ( X != Y ) Z ; +#define HANDLE_ERROR_DIFFERENT_RETURN(X,Y,Z,R) if ( X != Y ) { Z ; return R ; } + +namespace Lima +{ +namespace LinguisticProcessing +{ +namespace SemanticAnalysis +{ + +static SimpleFactory knowledgeBasedSemanticRoleLabelerFactory(KNOWLEDGEBASEDSEMANTICROLELABELER_CLASSID); + + +class KnowledgeBasedSemanticRoleLabelerPrivate +{ +public: + KnowledgeBasedSemanticRoleLabelerPrivate(); + virtual ~KnowledgeBasedSemanticRoleLabelerPrivate(); + + PyObject* m_instance; + const MediaProcessUnit* m_dumper; + const MediaProcessUnit* m_loader; + QString m_inputSuffix; + QString m_outputSuffix; + 
QString m_temporaryFileMetadata; +}; + +KnowledgeBasedSemanticRoleLabelerPrivate::KnowledgeBasedSemanticRoleLabelerPrivate() : + m_instance(0), + m_dumper(new ConllDumper()) +{} + +KnowledgeBasedSemanticRoleLabelerPrivate::~KnowledgeBasedSemanticRoleLabelerPrivate() +{ +} + +KnowledgeBasedSemanticRoleLabeler::KnowledgeBasedSemanticRoleLabeler() : m_d(new KnowledgeBasedSemanticRoleLabelerPrivate()) +{} + + +KnowledgeBasedSemanticRoleLabeler::~KnowledgeBasedSemanticRoleLabeler() +{ + delete m_d; +} + +auto failed_to_import_the_sys_module = []() +{ + SEMANTICANALYSISLOGINIT; + LERROR << "Failed to import the sys module"; + PyErr_Print(); +}; + +auto cannot_instantiate_the_semanticrolelabeler_python_class = []() +{ + SEMANTICANALYSISLOGINIT; + LERROR << "Cannot instantiate the SemanticRoleLabeler python class"; + PyErr_Print(); + Py_Exit(1); +}; + +void KnowledgeBasedSemanticRoleLabeler::init( + Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, + Manager* manager) + +{ +#ifdef DEBUG_LP + SEMANTICANALYSISLOGINIT; + LDEBUG << "KnowledgeBasedSemanticRoleLabeler::init"; +#endif + + MediaId language=manager->getInitializationParameters().media; + try { + string dumperName=unitConfiguration.getParamsValueAtKey("dumper"); + // create the dumper + m_d->m_dumper=manager->getObject(dumperName); + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + SEMANTICANALYSISLOGINIT; + LERROR << "Missing 'dumper' parameter in KnowledgeBasedSemanticRoleLabeler group for language " + << (int)language << " !"; + throw InvalidConfiguration(); + } + + try { + string loaderName=unitConfiguration.getParamsValueAtKey("loader"); + // create the loader + m_d->m_loader=manager->getObject(loaderName); + } + catch (InvalidConfiguration& ) { + m_d->m_loader = 0; + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + SEMANTICANALYSISLOGINIT; + LERROR << "Missing 'loader' parameter in KnowledgeBasedSemanticRoleLabeler group for language " + << (int)language << 
" !"; + throw InvalidConfiguration(); + } + + try { + m_d->m_temporaryFileMetadata = QString::fromUtf8(unitConfiguration.getParamsValueAtKey("temporaryFileMetadata").c_str()); + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + // optional parameter: keep default value (empty) + } + + if (m_d->m_temporaryFileMetadata.isEmpty()) + { + try { + m_d->m_inputSuffix=QString::fromUtf8(unitConfiguration.getParamsValueAtKey("inputSuffix").c_str()); + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + SEMANTICANALYSISLOGINIT; + LERROR << "Missing 'inputSuffix' parameter in KnowledgeBasedSemanticRoleLabeler group for language " + << (int)language << " !"; + throw InvalidConfiguration(); + } + + try { + m_d->m_outputSuffix=QString::fromUtf8(unitConfiguration.getParamsValueAtKey("outputSuffix").c_str()); + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) { + SEMANTICANALYSISLOGINIT; + LERROR << "Missing 'outputSuffix' parameter in KnowledgeBasedSemanticRoleLabeler group for language " + << (int)language << " !"; + throw InvalidConfiguration(); + } + } + QString kbsrlLogLevel = "error"; + try + { + kbsrlLogLevel = QString::fromUtf8(unitConfiguration.getParamsValueAtKey("loglevel").c_str()); + } + catch (NoSuchParam& ) + { + // keep default + } + + QString path; + try + { + path = QString::fromUtf8(unitConfiguration.getParamsValueAtKey("path").c_str()); + } + catch (NoSuchParam& ) + { + SEMANTICANALYSISLOGINIT; + LERROR << "no param 'path' in KnowledgeBasedSemanticRoleLabeler group configuration"; + throw InvalidConfiguration(); + } + + QString mode = "VerbNet"; + try + { + mode = QString::fromUtf8(unitConfiguration.getParamsValueAtKey("mode").c_str()); + if (mode != "VerbNet" && mode != "FrameNet") + { + SEMANTICANALYSISLOGINIT; + LERROR << "Unknown semantic annotation mode" << mode; + throw InvalidConfiguration(); + } + } + catch (NoSuchParam& ) + { + // keep default + } + + // Initialize the python SRL system + /* + * Find the first python 
executable in the path and use it as the program name. + * + * This allows to find the modules set up in an activated virtualenv + */ + QString str_program_name; + QString pathEnv = QString::fromUtf8(qgetenv("PATH").constData()); + for (const auto & path: pathEnv.split(QRegExp("[;:]"))) + { + if (QFile::exists(path + "/python" )) + { + str_program_name = path + "/python"; + break; + } + } +#ifndef WIN32 + Py_SetProgramName(const_cast( str_program_name.toStdWString().c_str())); +#else + Py_SetProgramName( (wchar_t*)str_program_name.unicode() ); +#endif + + + Py_Initialize(); + + PyObject* main_module = PyImport_ImportModule("__main__"); + PyObject* main_dict = PyModule_GetDict(main_module); + PyObject* sys_module = PyImport_ImportModule("sys"); + HANDLE_ERROR_EQUAL (sys_module, NULL, failed_to_import_the_sys_module() ); + + PyDict_SetItemString(main_dict, "sys", sys_module); + + // Add the path to the knowledgesrl pachkage to putho path + PyObject* pythonpath = PySys_GetObject("path"); + if (PyList_Append(pythonpath, PyUnicode_DecodeFSDefault(path.toUtf8().constData())) == -1) + { + SEMANTICANALYSISLOGINIT; + LERROR << "Failed to append to python path"; + PyErr_Print(); + Py_Exit(1); + } + + // Import the semanticrolelabeler module + PyObject* semanticrolelabeler_module = PyImport_ImportModule("semanticrolelabeler"); + if (semanticrolelabeler_module == NULL) + { + SEMANTICANALYSISLOGINIT; + LERROR << "KnowledgeBasedSemanticRoleLabeler::init"<< __FILE__ << __LINE__ << ": Failed to import srl semanticrolelabeler module"; + PyErr_Print(); + Py_Exit(1); + } + + // Create the semantic role labeller instance + m_d->m_instance = PyObject_CallMethod(semanticrolelabeler_module, "SemanticRoleLabeler", "[sss]", + QString("--log=%1").arg(kbsrlLogLevel).toUtf8().constData(), + QString("--frame-lexicon=%1").arg(mode).toUtf8().constData(), + QString("--language=%1").arg(Lima::Common::MediaticData::MediaticData::single().getMediaId(language).c_str()).toUtf8().constData()); + 
HANDLE_ERROR_EQUAL(m_d->m_instance,NULL,cannot_instantiate_the_semanticrolelabeler_python_class()) +} + +auto metadata_equal_zero = []() +{ + SEMANTICANALYSISLOGINIT; + LERROR << "no LinguisticMetaData ! abort"; +}; + +auto temporary_file_not_open = []() +{ + SEMANTICANALYSISLOGINIT; + LERROR << "KnowledgeBasedSemanticRoleLabeler: unable to create temporary file"; +}; + +auto temporary_file_srl_not_open = [](QScopedPointer& temporaryFile) +{ + SEMANTICANALYSISLOGINIT; + LERROR << "KnowledgeBasedSemanticRoleLabeler: unable to open temporary file for dumping SRL CoNLL data to it"<< temporaryFile->fileName(); + LERROR << "KnowledgeBasedSemanticRoleLabeler: keep (do not auto remove) it for debug purpose." ; + temporaryFile->setAutoRemove(false); +}; + +auto failed_to_load_data_from_temporary_file = [](QScopedPointer& temporaryFile) +{ + SEMANTICANALYSISLOGINIT; + LERROR << "KnowledgeBasedSemanticRoleLabeler: failed to load data from temporary file" << temporaryFile->fileName(); + LERROR << "KnowledgeBasedSemanticRoleLabeler: keep (do not auto remove) it for debug purpose." 
<< temporaryFile->fileName(); + temporaryFile->setAutoRemove(false); +}; + +auto failure_during_call_of_the_annotate_method_on = [](QString& conllInput) +{ + SEMANTICANALYSISLOGINIT; + LERROR << "Failure during call of the annotate method on" << conllInput; + PyErr_Print(); + Py_Exit(1); +}; + +LimaStatusCode KnowledgeBasedSemanticRoleLabeler::process( + AnalysisContent& analysis) const +{ + TimeUtilsController knowledgeBasedSemanticRoleLabelerProcessTime("KnowledgeBasedSemanticRoleLabeler"); +#ifdef DEBUG_LP + SEMANTICANALYSISLOGINIT; + LINFO << "start SRL process"; +#endif + + LinguisticMetaData* metadata=static_cast(analysis.getData("LinguisticMetaData")); + HANDLE_ERROR_EQUAL_RETURN(metadata,0,metadata_equal_zero(),MISSING_DATA) + + QScopedPointer temporaryFile; + if (!m_d->m_temporaryFileMetadata.isEmpty()) + { + QScopedPointer otherTemp(new QTemporaryFile()); + temporaryFile.swap(otherTemp); + HANDLE_ERROR_RETURN(!temporaryFile->open(),temporary_file_not_open(),CANNOT_OPEN_FILE_ERROR); + metadata->setMetaData(m_d->m_temporaryFileMetadata.toUtf8().constData(), + temporaryFile->fileName().toUtf8().constData()); + } + + // Use CoNLL duper to produce the input to the SRL + LimaStatusCode returnCode(SUCCESS_ID); + returnCode=m_d->m_dumper->process(analysis); + if (returnCode!=SUCCESS_ID) { + SEMANTICANALYSISLOGINIT; + LERROR << "KnowledgeBasedSemanticRoleLabeler: failed to dump data to temporary file"; + return returnCode; + } + + QString conllInput; + + if (m_d->m_temporaryFileMetadata.isEmpty()) + { + QString fileName = QString::fromUtf8(metadata->getMetaData("FileName").c_str()); + QString inputFilename; + if (!m_d->m_inputSuffix.isEmpty()) + { + inputFilename = fileName+ m_d->m_inputSuffix; + } + QFile inputFile(inputFilename); + inputFile.open(QIODevice::ReadOnly); + conllInput = QString::fromUtf8(inputFile.readAll().constData()); + inputFile.close(); + } + else + { + if (!temporaryFile->open()) + { + SEMANTICANALYSISLOGINIT; + LERROR << 
"KnowledgeBasedSemanticRoleLabeler: unable to open temporary file after dumping CoNLL data to it"<< temporaryFile->fileName(); + LERROR << "KnowledgeBasedSemanticRoleLabeler: keep (do not auto remove) it for debug purpose." ; + temporaryFile->setAutoRemove(false); + return CANNOT_OPEN_FILE_ERROR; + } + conllInput = QString::fromUtf8(temporaryFile->readAll().constData()); +#ifdef DEBUG_LP + temporaryFile->setAutoRemove(false); + SEMANTICANALYSISLOGINIT; + LDEBUG << "KnowledgeBasedSemanticRoleLabeler: keeping temporary file after dumping CoNLL data to it for debugging"<< temporaryFile->fileName(); +#endif + temporaryFile->close(); + } + + // Run the semantic role labeller + PyObject* callResult = PyObject_CallMethod(m_d->m_instance, "annotate", "ss", + conllInput.toUtf8().constData(), + metadata->getMetaData("Lang").c_str() + ); + HANDLE_ERROR_EQUAL(callResult, NULL, failure_during_call_of_the_annotate_method_on(conllInput)); + + // Display the SRL result + char* result = PyUnicode_AsUTF8(callResult); + if (result == NULL) + { + SEMANTICANALYSISLOGINIT; + LERROR << "Cannot convert result item to string"; + PyErr_Print(); + Py_Exit(1); + } +#ifdef DEBUG_LP + LDEBUG << "Python result is:" << result; +#endif + if (m_d->m_temporaryFileMetadata.isEmpty()) + { + QString outputFilename; + if (!m_d->m_outputSuffix.isEmpty()) + { + QString fileName = QString::fromUtf8(metadata->getMetaData("FileName").c_str()); + outputFilename = fileName + m_d->m_outputSuffix; + } + QFile outputFile(outputFilename); + outputFile.open(QIODevice::WriteOnly); + outputFile.write(result); + outputFile.close(); + } + else + { + HANDLE_ERROR_RETURN( !temporaryFile->open(), + temporary_file_srl_not_open(temporaryFile), CANNOT_OPEN_FILE_ERROR); + if (!temporaryFile->seek(0)) + { + SEMANTICANALYSISLOGINIT; + LERROR << "KnowledgeBasedSemanticRoleLabeler: unable to seek to the beginning of temporary file"<< temporaryFile->fileName(); + LERROR << "KnowledgeBasedSemanticRoleLabeler: keep (do not auto 
remove) it for debug purpose." ; + temporaryFile->setAutoRemove(false); + return UNKNOWN_ERROR; + } + if (temporaryFile->write(result) == -1) + { + SEMANTICANALYSISLOGINIT; + LERROR << "KnowledgeBasedSemanticRoleLabeler: unable to write SRL result to temporary file"<< temporaryFile->fileName(); + LERROR << "KnowledgeBasedSemanticRoleLabeler: keep (do not auto remove) it for debug purpose." ; + temporaryFile->setAutoRemove(false); + return UNKNOWN_ERROR; + } + temporaryFile->close(); + } + Py_DECREF(callResult); + // Import the CoNLL result + returnCode=m_d->m_loader->process(analysis); + HANDLE_ERROR_DIFFERENT_RETURN(returnCode,SUCCESS_ID,failed_to_load_data_from_temporary_file(temporaryFile),returnCode) + + return returnCode; +} + +} //namespace SemanticAnalysis +} // namespace LinguisticProcessing +} // namespace Lima + + + + + + + +#endif \ No newline at end of file diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.h b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.h new file mode 100644 index 000000000..f8927c377 --- /dev/null +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/KnowledgeBasedSemanticRoleLabeler.h @@ -0,0 +1,74 @@ +/* + Copyright 2016 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. 
If not, see +*/ + +#ifndef LIMA_LINGUISTICPROCESSING_SEMANTICANALYSIS_KNOWLEDGEBASEDSEMANTICROLELABELER_H +#define LIMA_LINGUISTICPROCESSING_SEMANTICANALYSIS_KNOWLEDGEBASEDSEMANTICROLELABELER_H + +#include "SemanticAnalysisExport.h" +#include "common/MediaProcessors/MediaProcessUnit.h" + +namespace Lima +{ +namespace LinguisticProcessing +{ +namespace SemanticAnalysis +{ + +#define KNOWLEDGEBASEDSEMANTICROLELABELER_CLASSID "KnowledgeBasedSemanticRoleLabeler" + +class KnowledgeBasedSemanticRoleLabelerPrivate; + +/** @brief This is a @ref MediaProcessUnit which do semantic role labeling using the knowledge-based + * SRL in python made by Quentin Pradet during his PhD thesis + * + * As a ProcessUnit, it has an init and a process function. See @ref ProcessUnit for details. + * + * IOPES: + * - Input: an AnalysisContent and the following parameters in the configuration file: + * - debug: whether the debug option of the python module should be activated or not + * - path: the path to the knowledgesrl python package + * - mode: the semantic model to use to annotate. Either VerbNet (default) or FrameNet. + * - Output: an AnalysisContent + * - Preconditions: the AnalysisContent must the result of the syntactic analysis + * - Effects: the annotation graph will be updated with SRL annotations. 
+ */ +class LIMA_SEMANTICANALYSIS_EXPORT KnowledgeBasedSemanticRoleLabeler : public MediaProcessUnit +{ + +public: + KnowledgeBasedSemanticRoleLabeler(); + virtual ~KnowledgeBasedSemanticRoleLabeler(); + + void init( + Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, + Manager* manager) + ; + + LimaStatusCode process(AnalysisContent& analysis) const; + + private: + + KnowledgeBasedSemanticRoleLabelerPrivate* m_d; +}; + +} // namespace SemanticAnalysis +} // namespace LinguisticProcessing +} // namespace Lima + +#endif diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRelationsXmlLogger.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRelationsXmlLogger.cpp index ab0857772..535831b19 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRelationsXmlLogger.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRelationsXmlLogger.cpp @@ -1,3 +1,14 @@ +#ifdef ANTINNO_SPECIFIC + + + + +// antinno travaille avec la version 2.1-patches tant que la version master n'est pas synchronisée + + + + + /* Copyright 2002-2013 CEA LIST @@ -25,15 +36,314 @@ * ***********************************************************************/ + +#include "SemanticRelationsXmlLogger.h" +#include "SemanticRelationAnnotation.h" +#include "SemanticAnnotation.h" + +#include "common/MediaticData/mediaticData.h" +#include "common/Data/strwstrtools.h" +#include "common/time/traceUtils.h" +#include "common/AbstractFactoryPattern/SimpleFactory.h" +//#include "common/annotationGraph/AnnotationData.h" +#include "linguisticProcessing/core/LinguisticProcessors/LinguisticMetaData.h" +#include "linguisticProcessing/core/LinguisticAnalysisStructure/AnalysisGraph.h" +#include "linguisticProcessing/core/LinguisticAnalysisStructure/Token.h" +#include "linguisticProcessing/core/Automaton/SpecificEntityAnnotation.h" + +#include + 
+#define SEMLOGINIT LOGINIT("LP::SemanticAnalysis") + +using namespace std; +using namespace boost; +using namespace Lima::Common::AnnotationGraphs; +using namespace Lima::LinguisticProcessing::SpecificEntities; + +namespace Lima { +namespace LinguisticProcessing { +namespace SemanticAnalysis { + +SimpleFactory +semanticRelationsXmlLoggerFactory(SEMANTICRELATIONSXMLLOGGER_CLASSID); + +SemanticRelationsXmlLogger::SemanticRelationsXmlLogger() : +AbstractLinguisticLogger(".output.xml"), +m_language(0), +m_graph("PosGraph") +{} + + +SemanticRelationsXmlLogger::~SemanticRelationsXmlLogger() +{} + +void SemanticRelationsXmlLogger::init( + Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, + Manager* manager) + +{ + AbstractLinguisticLogger::init(unitConfiguration,manager); + + m_language=manager->getInitializationParameters().media; + + try + { + m_graph=unitConfiguration.getParamsValueAtKey("graph"); + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) + { + SEMLOGINIT; + LWARN << "No 'graph' parameter in unit configuration '" + << unitConfiguration.getName() << "' ; using PosGraph"; + m_graph=string("PosGraph"); + } +} + +LimaStatusCode SemanticRelationsXmlLogger:: +process(AnalysisContent& analysis) const +{ + TimeUtils::updateCurrentTime(); + + SEMLOGINIT; + LERROR << "SemanticRelationsXmlLogger"; + + AnnotationData* annotationData = static_cast< AnnotationData* >(analysis.getData("AnnotationData")); + + const LinguisticAnalysisStructure::AnalysisGraph& graph = + *(static_cast(analysis.getData(m_graph))); + + LinguisticGraph* lingGraph = const_cast(graph.getGraph()); + VertexTokenPropertyMap tokenMap = get(vertex_token, *lingGraph); + LinguisticMetaData* metadata=static_cast(analysis.getData("LinguisticMetaData")); + if (metadata == 0) { + SEMLOGINIT; + LERROR << "no LinguisticMetaData ! 
abort"; + return MISSING_DATA; + } + + ofstream out; + if (!openLogFile(out,metadata->getMetaData("FileName"))) { + SEMLOGINIT; + LERROR << "Can't open log file "; + return UNKNOWN_ERROR; + } + + uint64_t offset(0); + try { + offset=atoi(metadata->getMetaData("StartOffset").c_str()); + } + catch (LinguisticProcessingException& e) { + // do nothing: not set in analyzeText (only in analyzeXmlDocuments) + } + + uint64_t offsetIndexingNode(0); + try { + offsetIndexingNode=atoi(metadata->getMetaData("StartOffsetIndexingNode").c_str()); + } + catch (LinguisticProcessingException& e) { + // do nothing: not set in analyzeText (only in analyzeXmlDocuments) + } + + std::string docId(""); + try { + docId=metadata->getMetaData("DocId"); + } + catch (LinguisticProcessingException& e) { + // do nothing: not set in analyzeText (only in analyzeXmlDocuments) + } + + out << "" << endl; + +// LDEBUG << "SemanticRelationsXmlLogger on graph " << m_graph; + + //look at all vertices for annotations + AnnotationGraphVertexIt itv, itv_end; + boost::tie(itv, itv_end) = vertices(annotationData->getGraph()); + for (; itv != itv_end; itv++) + { + LDEBUG << "SemanticRelationsXmlLogger on annotation vertex " << *itv; + if (annotationData->hasAnnotation(*itv,("SemanticAnnotation"))) + { +// LDEBUG << " it has SemanticRelationAnnotation"; + const SemanticAnnotation* annot = 0; + try + { + annot = annotationData->annotation(*itv,("SemanticAnnotation")) + .pointerValue(); + } + catch (const boost::bad_any_cast& e) + { + SEMLOGINIT; + LERROR << "This annotation is not a SemanticRelation"; + continue; + } + + // output + out << "getType() << "\">" << endl + << vertexStringForSemanticAnnotation("vertex",*itv,tokenMap,annotationData,offset) + << "" << endl; + } + } + + // look at all edges for relations + AnnotationGraphEdgeIt it,it_end; + const AnnotationGraph& annotGraph=annotationData->getGraph(); + boost::tie(it, it_end) = edges(annotGraph); + for (; it != it_end; it++) { + LDEBUG << 
"SemanticRelationsXmlLogger on annotation edge " + << source(*it,annotGraph) << "->" << target(*it,annotationData->getGraph()); + if (annotationData->hasAnnotation(*it,("SemanticRelation"))) + { + SEMLOGINIT; + LDEBUG << "found semantic relation"; + const SemanticRelationAnnotation* annot = 0; + try + { + annot = annotationData->annotation(*it,("SemanticRelation")) + .pointerValue(); + } + catch (const boost::bad_any_cast& e) + { + SEMLOGINIT; + LERROR << "This annotation is not a SemanticAnnotation"; + continue; + } + + //output + out << "type() << "\">" << endl + << vertexStringForSemanticAnnotation("source",source(*it,annotGraph),tokenMap,annotationData,offset) + << vertexStringForSemanticAnnotation("target",target(*it,annotGraph),tokenMap,annotationData,offset) + << "" << endl; + + } + } + +// LDEBUG << " all vertices done"; + out << "" << endl; + out.close(); + + TimeUtils::logElapsedTime("SemanticRelationsXmlLogger"); + return SUCCESS_ID; +} + +std::string SemanticRelationsXmlLogger:: +vertexStringForSemanticAnnotation(const std::string& vertexRole, + const AnnotationGraphVertex& vertex, + const VertexTokenPropertyMap& tokenMap, + AnnotationData* annotationData, + uint64_t offset) const +{ + ostringstream oss; + + // get id of the corresponding vertex in analysis graph + LinguisticGraphVertex v; + if (!annotationData->hasIntAnnotation(vertex,Common::Misc::utf8stdstring2limastring(m_graph))) + { + // SEMLOGINIT; + // LDEBUG << *itv << " has no " << m_graph << " annotation. Skeeping it."; + return ""; + } + v = annotationData->intAnnotation(vertex,Common::Misc::utf8stdstring2limastring(m_graph)); + LinguisticAnalysisStructure::Token* vToken = tokenMap[v]; + // LDEBUG << "SemanticRelationsXmlLogger tokenMap[" << v << "] = " << vToken; + if (vToken == 0) + { + SEMLOGINIT; + LERROR << "Vertex " << v << " has no entry in the analysis graph token map. 
This should not happen !!"; + return ""; + } + + // get annotation : element in relation can be an entity => get entity type + // otherwise, its type is "token" + std::string type("token"); + + auto matches = annotationData->matches(m_graph,v,"annot"); + for (auto it = matches.begin(); it != matches.end(); it++) + { + if (annotationData->hasAnnotation(*it,Common::Misc::utf8stdstring2limastring("SpecificEntity"))) { + const SpecificEntityAnnotation* annot = 0; + try { + annot = annotationData->annotation(*it,Common::Misc::utf8stdstring2limastring("SpecificEntity")) + .pointerValue(); + } + catch (const boost::bad_any_cast& e) { + SEMLOGINIT; + LERROR << "This annotation is not a SemanticAnnotation"; + continue; + } + type=Common::Misc::limastring2utf8stdstring(Common::MediaticData::MediaticData::single().getEntityName(annot->getType())); + break; + } + } + + oss << " <" << vertexRole + << " type=\"" << type << "\"" + << " pos=\"" << offset+vToken->position() << "\"" + << " len=\"" << vToken->length() << "\"" + << " string=\"" << vToken->stringForm() << "\"" + << "/>" << endl; + return oss.str(); +} + + +} // SemanticAnalysis +} // LinguisticProcessing +} // Lima + + + + + +#else + + +// version master + + +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + + + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. 
If not, see +*/ +/************************************************************************ + * + * @file SemanticRelationsXmlLogger.cpp + * @author (romaric.besancon@cea.fr) + * @date Mon Sep 17 2007 + * copyright Copyright (C) 2007 by CEA LIST + * + ***********************************************************************/ + + #include "SemanticRelationsXmlLogger.h" #include "SemanticRelationAnnotation.h" #include "SemanticAnnotation.h" // #include "common/linguisticData/linguisticData.h" -#include "common/misc/strwstrtools.h" -#include "common/misc/traceUtils.h" +//#include "common/misc/strwstrtools.h" +//#include "common/misc/traceUtils.h" +#include "common/MediaticData/mediaticData.h" +#include "common/Data/strwstrtools.h" +#include "common/time/traceUtils.h" #include "common/AbstractFactoryPattern/SimpleFactory.h" -#include "common/annotationGraph/AnnotationData.h" +//#include "common/annotationGraph/AnnotationData.h" #include "linguisticProcessing/core/LinguisticProcessors/LinguisticMetaData.h" #include "linguisticProcessing/core/LinguisticAnalysisStructure/AnalysisGraph.h" #include "linguisticProcessing/core/LinguisticAnalysisStructure/Token.h" @@ -52,7 +362,7 @@ namespace Lima { namespace LinguisticProcessing { namespace SemanticAnalysis { -SimpleFactory +SimpleFactory semanticRelationsXmlLoggerFactory(SEMANTICRELATIONSXMLLOGGER_CLASSID); SemanticRelationsXmlLogger::SemanticRelationsXmlLogger() : @@ -72,7 +382,7 @@ void SemanticRelationsXmlLogger::init( { AbstractLinguisticLogger::init(unitConfiguration,manager); - m_language=manager->getInitializationParameters().language; + m_language=manager->getInitializationParameters().media; try { @@ -152,13 +462,13 @@ process(AnalysisContent& analysis) const for (; itv != itv_end; itv++) { LDEBUG << "SemanticRelationsXmlLogger on annotation vertex " << *itv; - if (annotationData->hasAnnotation(*itv,Common::Misc::utf8stdstring2limastring("SemanticAnnotation"))) + if 
(annotationData->hasAnnotation(*itv,("SemanticAnnotation"))) { // LDEBUG << " it has SemanticRelationAnnotation"; const SemanticAnnotation* annot = 0; try { - annot = annotationData->annotation(*itv,Common::Misc::utf8stdstring2limastring("SemanticAnnotation")) + annot = annotationData->annotation(*itv,("SemanticAnnotation")) .pointerValue(); } catch (const boost::bad_any_cast& e) @@ -182,14 +492,14 @@ process(AnalysisContent& analysis) const for (; it != it_end; it++) { LDEBUG << "SemanticRelationsXmlLogger on annotation edge " << source(*it,annotGraph) << "->" << target(*it,annotationData->getGraph()); - if (annotationData->hasAnnotation(*it,Common::Misc::utf8stdstring2limastring("SemanticRelation"))) + if (annotationData->hasAnnotation(*it,("SemanticRelation"))) { SEMLOGINIT; LDEBUG << "found semantic relation"; const SemanticRelationAnnotation* annot = 0; try { - annot = annotationData->annotation(*it,Common::Misc::utf8stdstring2limastring("SemanticRelation")) + annot = annotationData->annotation(*it,("SemanticRelation")) .pointerValue(); } catch (const boost::bad_any_cast& e) @@ -247,9 +557,8 @@ vertexStringForSemanticAnnotation(const std::string& vertexRole, // otherwise, its type is "token" std::string type("token"); - std::set< uint32_t > matches = annotationData->matches(m_graph,v,"annot"); - for (std::set< uint32_t >::const_iterator it = matches.begin(); - it != matches.end(); it++) + auto matches = annotationData->matches(m_graph,v,"annot"); + for (auto it = matches.begin(); it != matches.end(); it++) { if (annotationData->hasAnnotation(*it,Common::Misc::utf8stdstring2limastring("SpecificEntity"))) { const SpecificEntityAnnotation* annot = 0; @@ -262,7 +571,7 @@ vertexStringForSemanticAnnotation(const std::string& vertexRole, LERROR << "This annotation is not a SemanticAnnotation"; continue; } - type=Common::Misc::limastring2utf8stdstring(Common::LinguisticData::LinguisticData::single().getEntityName(annot->getType())); + 
type=Common::Misc::limastring2utf8stdstring(Common::MediaticData::MediaticData::single().getEntityName(annot->getType())); break; } } @@ -280,3 +589,7 @@ vertexStringForSemanticAnnotation(const std::string& vertexRole, } // SemanticAnalysis } // LinguisticProcessing } // Lima + + + +#endif \ No newline at end of file diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRelationsXmlLogger.h b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRelationsXmlLogger.h index 53d135d57..29dfb7492 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRelationsXmlLogger.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRelationsXmlLogger.h @@ -1,3 +1,12 @@ +#ifdef ANTINNO_SPECIFIC + + + + +// antinno travaille avec la version 2.1-patches tant que la version master n'est pas synchronisée + + + /* Copyright 2002-2013 CEA LIST @@ -78,3 +87,97 @@ class SemanticRelationsXmlLogger : public AbstractLinguisticLogger } // Lima #endif + + + +#else + + +// version master + + +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. 
If not, see +*/ +/************************************************************************ + * + * @file SemanticRelationsXmlLogger.h + * @author Romaric Besancon (romaric.besancon@cea.fr) + * @date Wed Sep 12 2007 + * copyright Copyright (C) 2007 by CEA LIST + * Project s2lp + * + * @brief xml logger for the semantic relation annotations from the + * annotation graph + * + * + ***********************************************************************/ + +#ifndef SEMANTICRELATIONSXMLLOGGERSEMANTICRELATIONSXMLLOGGER_H +#define SEMANTICRELATIONSXMLLOGGERSEMANTICRELATIONSXMLLOGGER_H + +#include "linguisticProcessing/core/LinguisticProcessors/AbstractTextualAnalysisDumper.h" +#include "linguisticProcessing/core/LinguisticProcessors/AbstractLinguisticLogger.h" +#include "linguisticProcessing/core/LinguisticAnalysisStructure/AnalysisGraph.h" +#include "linguisticProcessing/common/annotationGraph/AnnotationData.h" + +namespace Lima +{ +namespace LinguisticProcessing +{ +namespace SemanticAnalysis +{ + +#define SEMANTICRELATIONSXMLLOGGER_CLASSID "SemanticRelationsXmlLogger" +class SemanticRelationsXmlLogger : public AbstractLinguisticLogger +{ +public: + SemanticRelationsXmlLogger(); + + virtual ~SemanticRelationsXmlLogger(); + + virtual void init( + Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, + Manager* manager) + ; + + virtual LimaStatusCode process( + AnalysisContent& analysis) const; + +private: + MediaId m_language; + std::string m_graph; + + // private memeber functions + std::string vertexStringForSemanticAnnotation(const std::string& vertexRole, + const LinguisticGraphVertex& vertex, + const VertexTokenPropertyMap& tokenMap, + Common::AnnotationGraphs::AnnotationData* annotationData, + uint64_t offset) const; + +}; + +} // SemanticAnalysis +} // LinguisticProcessing +} // Lima + +#endif + + + +#endif \ No newline at end of file diff --git 
a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRoleLabelingLoader.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRoleLabelingLoader.cpp index 736bfc0fd..575fdb5a3 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRoleLabelingLoader.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRoleLabelingLoader.cpp @@ -1,3 +1,13 @@ +#ifdef ANTINNO_SPECIFIC + + + + +// antinno travaille avec la version 2.1-patches tant que la version master n'est pas synchronisée + + + + /* Copyright 2002-2014 CEA LIST @@ -19,14 +29,17 @@ /************************************************************************ * * @file SemanticRoleLabelingLoader.cpp - * @author Clémence Filmont + * @author Clémence Filmont + * @author Gael de Chalendar * @date 2014 - * copyright Copyright (C) 2014 by CEA LIST + * copyright Copyright (C) 2014-2016 by CEA LIST ***********************************************************************/ #include "SemanticRoleLabelingLoader.h" #include "LimaConllTokenIdMapping.h" - +#ifdef ANTINNO_SPECIFIC +#include +#endif #include "common/AbstractFactoryPattern/SimpleFactory.h" #include "common/Data/strwstrtools.h" #include "common/MediaticData/mediaticData.h" @@ -43,6 +56,7 @@ #include #include +#include #include #include @@ -64,6 +78,15 @@ namespace SemanticAnalysis { SimpleFactory SemanticRoleLabelingFactory(SEMANTICROLELABELINGLOADER_CLASSID); +#define NBCOLSINSRLBEFOREFRAME 11 +#define CONLLTOKENSEPARATOR "[\\r\\n]" +#define CONLLFIELDSEPARATOR "\\t" +/* +#define NBCOLSINSRLBEFOREFRAME 11 +#define CONLLTOKENSEPARATOR "\n+" +#define CONLLFIELDSEPARATOR "\t" +*/ + // Conll handler struct ConllHandler { @@ -117,6 +140,7 @@ class SemanticRoleLabelingLoaderPrivate MediaId m_language; std::string m_graph; + QString m_model; }; @@ -124,7 +148,8 @@ class SemanticRoleLabelingLoaderPrivate 
//*********************************************************************** SemanticRoleLabelingLoaderPrivate::SemanticRoleLabelingLoaderPrivate(): m_language(0), -m_graph("PosGraph") +m_graph("PosGraph"), +m_model("VerbNet") {} SemanticRoleLabelingLoaderPrivate::~SemanticRoleLabelingLoaderPrivate() @@ -156,6 +181,11 @@ void SemanticRoleLabelingLoader::init(Common::XMLConfigurationFiles::GroupConfig m_d->m_graph=unitConfiguration.getParamsValueAtKey("graph"); } catch (NoSuchParam& ) {} // keep default value + try + { + m_d->m_model = QString::fromUtf8(unitConfiguration.getParamsValueAtKey("model").c_str()); + } + catch (NoSuchParam& ) {} // keep default value } @@ -171,6 +201,7 @@ LimaStatusCode SemanticRoleLabelingLoader::process(AnalysisContent& analysis) co AnnotationData* annotationData = static_cast(analysis.getData("AnnotationData")); LimaConllTokenIdMapping* limaConllMapping = static_cast(analysis.getData("LimaConllTokenIdMapping")); +<<<<<<< .mine LinguisticMetaData* metadata=static_cast(analysis.getData("LinguisticMetaData")); if (metadata == 0) { @@ -179,6 +210,7 @@ LimaStatusCode SemanticRoleLabelingLoader::process(AnalysisContent& analysis) co } QString fileName = QString::fromUtf8((metadata->getMetaData("FileName")+m_inputFileExtension).c_str()); + QFile file(fileName); @@ -210,30 +242,500 @@ LimaStatusCode SemanticRoleLabelingLoader::process(AnalysisContent& analysis) co { int sentenceIndex=it->first; QString sentence=it->second; - if(cHandler.extractSemanticInformation(sentenceIndex, limaConllMapping,sentence)){ + if(cHandler.extractSemanticInformation(sentenceIndex, limaConllMapping, sentence)) + { +#ifdef DEBUG_LP LDEBUG << "SemanticRoleLabelingLoader::process there is/are " << cHandler.m_verbalClassNb << "verbal class(es) for this sentence " ; - for (int vClassIndex=0;vClassIndexm_model + "." 
+ verbalClass; + } + LimaString verbalClass= verbalClasses.join("|"); + AnnotationGraphVertex annotPredicateVertex=annotationData->createAnnotationVertex(); + annotationData->addMatching("PosGraph", posGraphPredicateVertex, "annot", annotPredicateVertex); + annotationData->annotate(annotPredicateVertex, "Predicate", verbalClass); + + +#ifdef DEBUG_LP + LDEBUG << "SemanticRoleLabelingLoader::process: annotation vertex"<< annotPredicateVertex <<"was created for the verbal class "<< annotationData->stringAnnotation(annotPredicateVertex, "Predicate") << "and the PoS graph vertex"<>::iterator semRoleIt; + for (semRoleIt=cHandler.m_semanticRoles[vClassIndex].begin(); semRoleIt!=cHandler.m_semanticRoles[vClassIndex].end();semRoleIt++){ + LinguisticGraphVertex posGraphRoleVertex=(*semRoleIt).first; + + QStringList semanticRoles = (*semRoleIt).second.split("|"); + //for (QString& semanticRole: semanticRoles) + // Modif NAN compatibilité de compilation + for (QStringList::iterator it=semanticRoles.begin(); it!=semanticRoles.end(); ++it) + { + QString& semanticRole = *it; + if (!semanticRole.isEmpty()) + semanticRole = m_d->m_model + "." 
+ semanticRole; + } + LimaString semanticRole= semanticRoles.join("|"); + AnnotationGraphVertex annotRoleVertex=annotationData->createAnnotationVertex(); + AnnotationGraphEdge roleEdge=annotationData->createAnnotationEdge(annotPredicateVertex, annotRoleVertex); + annotationData->annotate(roleEdge, "SemanticRole", semanticRole); + annotationData->addMatching("PosGraph", posGraphRoleVertex, "annot", annotRoleVertex); + + +#ifdef DEBUG_LP + LDEBUG << "SemanticRoleLabelingLoader::process: annotation edge" << roleEdge << "annotated " << annotationData->stringAnnotation(roleEdge, "SemanticRole")<< "was created for" << verbalClass << " and the PoS graph vertices " << posGraphPredicateVertex << "and" << posGraphRoleVertex ; +#endif + } + } + } + } + return SUCCESS_ID; +} + + + +ConllHandler::ConllHandler(MediaId language, AnalysisContent& analysis, LinguisticAnalysisStructure::AnalysisGraph* graph): +m_language(language), +m_analysis(analysis), +m_graph(graph), +m_descriptorSeparator(CONLLFIELDSEPARATOR), +m_tokenSeparator(CONLLTOKENSEPARATOR), +m_verbalClasses(), +m_semanticRoles(), +m_verbalClassNb() +{ +} +ConllHandler::~ConllHandler() +{ +} + +// designed to be repeated on each sentence +bool ConllHandler::extractSemanticInformation(int sentenceI, LimaConllTokenIdMapping* limaConllMapping, const QString & sent) +{ + SEMANTICANALYSISLOGINIT; + ConllHandler cHandler(m_language, m_analysis, m_graph); + QStringList sentenceTokens=cHandler.splitSegment(sent, m_tokenSeparator); + if (sentenceTokens.isEmpty()) + { + return false; + } + QString firstSentenceToken=(*sentenceTokens.constBegin()); + int descriptorsNb = cHandler.splitSegment(firstSentenceToken, m_descriptorSeparator).size(); + m_verbalClassNb = descriptorsNb - NBCOLSINSRLBEFOREFRAME - 1; + int classIndex=0; + if (m_verbalClassNb > 0) + { +#ifdef DEBUG_LP + LDEBUG << "ConllHandler::extractSemanticInformation" << m_verbalClassNb << sentenceI << " : \n" << sent ; +#endif + m_verbalClasses.clear(); + 
m_verbalClasses.resize(m_verbalClassNb); + m_semanticRoles.clear(); + m_semanticRoles.resize(m_verbalClassNb); + //repeated on each token of the sentence, that is on each line +#ifdef ANTINNO_SPECIFIC +BOOST_FOREACH (const auto & token, sentenceTokens) +#else + for (const auto & token: sentenceTokens) +#endif + { + int roleNumbers=0; + QStringList descriptors=cHandler.splitSegment(token,m_descriptorSeparator); + if (descriptors.size()>=NBCOLSINSRLBEFOREFRAME+m_verbalClassNb) + { + int conllTokenId=descriptors[0].toInt(); + QString conllToken=descriptors[1]; +#ifdef DEBUG_LP + LDEBUG << "ConllHandler::extractSemanticInformation token " << conllTokenId << conllToken; +#endif + if(descriptors[NBCOLSINSRLBEFOREFRAME]!="_") + { + QString verbalClass=descriptors[NBCOLSINSRLBEFOREFRAME]; + QString vClass=descriptors[NBCOLSINSRLBEFOREFRAME]; +#ifdef DEBUG_LP + LDEBUG << "ConllHandler::extractSemanticInformation verbalClass" << vClass; +#endif + LinguisticGraphVertex limaTokenId=cHandler.getLimaTokenId(conllTokenId, sentenceI, limaConllMapping); + if (classIndex >= m_verbalClasses.size()) + { + LERROR << "ConllHandler::extractSemanticInformation classIndex error" << classIndex; + break; + } + m_verbalClasses[classIndex]=qMakePair(limaTokenId, vClass); + classIndex++; + } +#ifdef ANTINNO_SPECIFIC + BOOST_FOREACH (auto roleTargetFieldIndex, boost::irange(0,m_verbalClassNb)) +#else + for (auto roleTargetFieldIndex : boost::irange(0,m_verbalClassNb)) +#endif + { +#ifdef DEBUG_LP + LDEBUG << "ConllHandler::extractSemanticInformation"<<"nb descriptors and roleTargetFieldIndex" << descriptors.size() << roleTargetFieldIndex ; +#endif + if (NBCOLSINSRLBEFOREFRAME+1+roleTargetFieldIndex >= descriptors.size()) + { + LERROR << "ConllHandler::extractSemanticInformation roleTargetFieldIndex error" << roleTargetFieldIndex; + break; + } + if (descriptors[NBCOLSINSRLBEFOREFRAME+1+roleTargetFieldIndex]!="_") + { + QString 
semanticRoleLabel=descriptors[NBCOLSINSRLBEFOREFRAME+1+roleTargetFieldIndex]; + + LinguisticGraphVertex limaTokenId=cHandler.getLimaTokenId(conllTokenId, sentenceI, limaConllMapping); + if(limaTokenId!=0) + { +#ifdef DEBUG_LP + LDEBUG << "ConllHandler::extractSemanticInformation argument "<> sRoles; + if (roleTargetFieldIndex >= m_semanticRoles.size()) + { + LERROR << "ConllHandler::extractSemanticInformation roleTargetFieldIndex error 2" << roleTargetFieldIndex; + break; + } + m_semanticRoles[roleTargetFieldIndex].push_back(make_pair(limaTokenId,semanticRoleLabel)); + } + roleNumbers++; + } + } + } + } + } + return classIndex != 0; +} + + + +QStringList ConllHandler::splitSegment(const QString & segment, QRegExp separator) +{ + QStringList segmentsSplited; + segmentsSplited =segment.split(QRegExp(separator),QString::SkipEmptyParts); + return segmentsSplited; +} + +LinguisticGraphVertex ConllHandler::getLimaTokenId(int conllTokenId, int sentenceI, LimaConllTokenIdMapping* limaConllMapping) +{ + SEMANTICANALYSISLOGINIT; + std::map< int,std::map< int,LinguisticGraphVertex>>::iterator limaConllMappingIt; + limaConllMappingIt=limaConllMapping->find(sentenceI); + if (limaConllMappingIt == limaConllMapping->end()) + { + LERROR << "Sentence " << sentenceI << " not found"; + return 0; + } + std::map< int,LinguisticGraphVertex> limaConllId=(*limaConllMappingIt).second; + std::map< int,LinguisticGraphVertex>::iterator limaConllIdIt=limaConllId.find(conllTokenId); + if (limaConllIdIt==limaConllId.end()) + { + LERROR << "Conll token id " << conllTokenId << " not found"; + return 0; + } + LinguisticGraphVertex limaTokenId=limaConllIdIt->second; + return limaTokenId; +} + +} +} +} // end namespace + + + + + + +#else + + +// version master + + + +/* + Copyright 2002-2014 CEA LIST + + This file is part of LIMA. 
+ + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. If not, see +*/ +/************************************************************************ + * + * @file SemanticRoleLabelingLoader.cpp + * @author Clémence Filmont + * @author Gael de Chalendar + * @date 2014 + * copyright Copyright (C) 2014-2016 by CEA LIST + ***********************************************************************/ + +#include "SemanticRoleLabelingLoader.h" +#include "LimaConllTokenIdMapping.h" +#ifdef ANTINNO_SPECIFIC +#include +#endif +#include "common/AbstractFactoryPattern/SimpleFactory.h" +#include "common/Data/strwstrtools.h" +#include "common/MediaticData/mediaticData.h" +#include "linguisticProcessing/LinguisticProcessingCommon.h" +#include "linguisticProcessing/common/annotationGraph/AnnotationData.h" +#include "linguisticProcessing/common/annotationGraph/AnnotationGraph.h" +#include "linguisticProcessing/core/Automaton/recognizerData.h" +#include "linguisticProcessing/core/LinguisticAnalysisStructure/LinguisticGraph.h" +#include "linguisticProcessing/core/LinguisticProcessors/LinguisticMetaData.h" +#include "linguisticProcessing/core/LinguisticAnalysisStructure/AnalysisGraph.h" + +#include +#include "QStringList" +#include +#include + +#include + +#include +#include +#include +#include + +using namespace std; +using namespace Lima::LinguisticProcessing::LinguisticAnalysisStructure; +using namespace Lima::LinguisticProcessing::ApplyRecognizer; +using namespace 
Lima::Common::XMLConfigurationFiles; +using namespace Lima::Common::AnnotationGraphs; +using namespace Lima::LinguisticProcessing::SemanticAnalysis; +using namespace Lima::Common::AnnotationGraphs; + + +namespace Lima { +namespace LinguisticProcessing { +namespace SemanticAnalysis { + +SimpleFactory SemanticRoleLabelingFactory(SEMANTICROLELABELINGLOADER_CLASSID); + +#define NBCOLSINSRLBEFOREFRAME 11 +#define CONLLTOKENSEPARATOR "\n+" +#define CONLLFIELDSEPARATOR "\t" + +// Conll handler +struct ConllHandler +{ + ConllHandler(MediaId language, AnalysisContent& analysis, LinguisticAnalysisStructure::AnalysisGraph* graph); + virtual ~ConllHandler(); + + /** + * @brief extract semantic annotations associated to token + * @param sentenceIndex the index of the current sentence + * @param limaConllMapping the chosen lima conll token id mapping + * @param sentence the current sentence + * @return true if any verbal class is found, false otherwise + */ + bool extractSemanticInformation(int sentenceIndex, LimaConllTokenIdMapping* limaConllMapping, const QString & sentence); + + /** + * @brief split a text into different types segments + * @param segment the segment to split + * @param separator the separator used to split + * @return the segment split + */ + QStringList splitSegment(const QString & segment, QRegExp separator); + + /** + * @brief get the lima token id matching any conll token one from the same text + * @param conllTokenId the conll token id one search the matched lima id + * @param sentenceNb the index of the current sentence + * @param limaConllMapping the chosen lima conll token id mapping + * @return the lima token id + * @note function to put in the LimaConllTokenIdMapping class? 
+ */ + LinguisticGraphVertex getLimaTokenId(int conllTokenId, int sentenceIndex, LimaConllTokenIdMapping* limaConllMapping); + + + MediaId m_language; + AnalysisContent& m_analysis; + LinguisticAnalysisStructure::AnalysisGraph* m_graph; + QRegExp m_descriptorSeparator; + QRegExp m_tokenSeparator; + QVector< QPair > m_verbalClasses; + QVector < std::vector> >m_semanticRoles; + int m_verbalClassNb; +}; + + +class SemanticRoleLabelingLoaderPrivate +{ + friend class SemanticRoleLabelingLoader; + SemanticRoleLabelingLoaderPrivate(); + ~SemanticRoleLabelingLoaderPrivate(); + + MediaId m_language; + std::string m_graph; + QString m_model; +}; + + + +//*********************************************************************** +SemanticRoleLabelingLoaderPrivate::SemanticRoleLabelingLoaderPrivate(): +m_language(0), +m_graph("PosGraph"), +m_model("VerbNet") +{} + +SemanticRoleLabelingLoaderPrivate::~SemanticRoleLabelingLoaderPrivate() +{ +} + +//*********************************************************************** +SemanticRoleLabelingLoader::SemanticRoleLabelingLoader(): + AnalysisLoader(), + m_d(new SemanticRoleLabelingLoaderPrivate()) +{ +} + +SemanticRoleLabelingLoader::~SemanticRoleLabelingLoader() +{ + delete m_d; +} + +//*********************************************************************** + +void SemanticRoleLabelingLoader::init(Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, Manager* manager) +{ + + SEMANTICANALYSISLOGINIT; + m_d->m_language=manager->getInitializationParameters().media; + AnalysisLoader::init(unitConfiguration,manager); + try + { + m_d->m_graph=unitConfiguration.getParamsValueAtKey("graph"); + } + catch (NoSuchParam& ) {} // keep default value + try + { + m_d->m_model = QString::fromUtf8(unitConfiguration.getParamsValueAtKey("model").c_str()); + } + catch (NoSuchParam& ) {} // keep default value +} + + +LimaStatusCode SemanticRoleLabelingLoader::process(AnalysisContent& analysis) const +{ + SEMANTICANALYSISLOGINIT; + 
AnalysisGraph* tokenList=static_cast(analysis.getData(m_d->m_graph)); + if (tokenList==0) + { + LERROR << "graph " << m_d->m_graph << " has not been produced: check pipeline" ; + return MISSING_DATA; + } + AnnotationData* annotationData = static_cast(analysis.getData("AnnotationData")); + LimaConllTokenIdMapping* limaConllMapping = static_cast(analysis.getData("LimaConllTokenIdMapping")); + + QString fileName = getInputFile(analysis); +======= + LinguisticMetaData* metadata=static_cast(analysis.getData("LinguisticMetaData")); + if (metadata == 0) + { + LERROR << "no LinguisticMetaData ! abort"; + return MISSING_DATA; + } +>>>>>>> .r8104 + + QString fileName = QString::fromUtf8((metadata->getMetaData("FileName")+m_inputFileExtension).c_str()); + + QFile file(fileName); + + + if (!file.open(QIODevice::ReadOnly)) + { + LERROR << "cannot open file" << fileName; + return CANNOT_OPEN_FILE_ERROR; + } + int sentenceNb=1; + std::map sentences; + while (!file.atEnd()) + { + QByteArray text=file.readLine(); + QString textString = QString::fromUtf8(text.constData()); + //One assume that the input file does not start with a blank line + if (textString.size()<3) + { + sentenceNb++; + } + else + { + QString becomingSentence=sentences[sentenceNb]+textString; + sentences[sentenceNb]= becomingSentence; + } + } + + ConllHandler cHandler(m_d->m_language, analysis, tokenList); + for (std::map::iterator it=sentences.begin(); it!=sentences.end(); ++it) + { + int sentenceIndex=it->first; + QString sentence=it->second; + if(cHandler.extractSemanticInformation(sentenceIndex, limaConllMapping, sentence)) + { +#ifdef DEBUG_LP + LDEBUG << "SemanticRoleLabelingLoader::process there is/are " << cHandler.m_verbalClassNb << "verbal class(es) for this sentence " ; +#endif + for (int vClassIndex=0;vClassIndexm_model + "." 
+ verbalClass; + } + LimaString verbalClass= verbalClasses.join("|"); AnnotationGraphVertex annotPredicateVertex=annotationData->createAnnotationVertex(); annotationData->addMatching("PosGraph", posGraphPredicateVertex, "annot", annotPredicateVertex); annotationData->annotate(annotPredicateVertex, "Predicate", verbalClass); +#ifdef DEBUG_LP LDEBUG << "SemanticRoleLabelingLoader::process: annotation vertex"<< annotPredicateVertex <<"was created for the verbal class "<< annotationData->stringAnnotation(annotPredicateVertex, "Predicate") << "and the PoS graph vertex"<>::iterator semRoleIt; for (semRoleIt=cHandler.m_semanticRoles[vClassIndex].begin(); semRoleIt!=cHandler.m_semanticRoles[vClassIndex].end();semRoleIt++){ LinguisticGraphVertex posGraphRoleVertex=(*semRoleIt).first; - LimaString semanticRole=(*semRoleIt).second; - AnnotationGraphVertex annotRoleVertex=annotationData->createAnnotationVertex(); + QStringList semanticRoles = (*semRoleIt).second.split("|"); + //for (QString& semanticRole: semanticRoles) + // Modif NAN compatibilité de compilation + for (QStringList::iterator it=semanticRoles.begin(); it!=semanticRoles.end(); ++it) + { + QString& semanticRole = *it; + if (!semanticRole.isEmpty()) + semanticRole = m_d->m_model + "." 
+ semanticRole; + } + LimaString semanticRole= semanticRoles.join("|"); + AnnotationGraphVertex annotRoleVertex=annotationData->createAnnotationVertex(); AnnotationGraphEdge roleEdge=annotationData->createAnnotationEdge(annotPredicateVertex, annotRoleVertex); annotationData->annotate(roleEdge, "SemanticRole", semanticRole); annotationData->addMatching("PosGraph", posGraphRoleVertex, "annot", annotRoleVertex); +#ifdef DEBUG_LP LDEBUG << "SemanticRoleLabelingLoader::process: annotation edge" << roleEdge << "annotated " << annotationData->stringAnnotation(roleEdge, "SemanticRole")<< "was created for" << verbalClass << " and the PoS graph vertices " << posGraphPredicateVertex << "and" << posGraphRoleVertex ; +#endif } } } @@ -247,8 +749,8 @@ ConllHandler::ConllHandler(MediaId language, AnalysisContent& analysis, Linguist m_language(language), m_analysis(analysis), m_graph(graph), -m_descriptorSeparator("\t+"), -m_tokenSeparator("\n+"), +m_descriptorSeparator(CONLLFIELDSEPARATOR), +m_tokenSeparator(CONLLTOKENSEPARATOR), m_verbalClasses(), m_semanticRoles(), m_verbalClassNb() @@ -264,31 +766,46 @@ bool ConllHandler::extractSemanticInformation(int sentenceI, LimaConllTokenIdMap SEMANTICANALYSISLOGINIT; ConllHandler cHandler(m_language, m_analysis, m_graph); QStringList sentenceTokens=cHandler.splitSegment(sent, m_tokenSeparator); + if (sentenceTokens.isEmpty()) + { + return false; + } QString firstSentenceToken=(*sentenceTokens.constBegin()); int descriptorsNb = cHandler.splitSegment(firstSentenceToken, m_descriptorSeparator).size(); - m_verbalClassNb = descriptorsNb -11; + m_verbalClassNb = descriptorsNb - NBCOLSINSRLBEFOREFRAME - 1; int classIndex=0; if (m_verbalClassNb > 0) { - LDEBUG << "ConllHandler::extractSemanticInformation" << sentenceI << " : \n" << sent ; +#ifdef DEBUG_LP + LDEBUG << "ConllHandler::extractSemanticInformation" << m_verbalClassNb << sentenceI << " : \n" << sent ; +#endif m_verbalClasses.clear(); m_verbalClasses.resize(m_verbalClassNb); 
m_semanticRoles.clear(); m_semanticRoles.resize(m_verbalClassNb); //repeated on each token of the sentence, that is on each line - for (QStringList::const_iterator tokensIterator = sentenceTokens.constBegin(); tokensIterator != sentenceTokens.constEnd(); - ++tokensIterator) +#ifdef ANTINNO_SPECIFIC +BOOST_FOREACH (const auto & token, sentenceTokens) +#else + for (const auto & token: sentenceTokens) +#endif { int roleNumbers=0; - QStringList descriptors=cHandler.splitSegment((*tokensIterator),m_descriptorSeparator); - if (descriptors.size()>=11+m_verbalClassNb) + QStringList descriptors=cHandler.splitSegment(token,m_descriptorSeparator); + if (descriptors.size()>=NBCOLSINSRLBEFOREFRAME+m_verbalClassNb) { int conllTokenId=descriptors[0].toInt(); QString conllToken=descriptors[1]; - if(descriptors[10]!="-") +#ifdef DEBUG_LP + LDEBUG << "ConllHandler::extractSemanticInformation token " << conllTokenId << conllToken; +#endif + if(descriptors[NBCOLSINSRLBEFOREFRAME]!="_") { - QString verbalClass=descriptors[10]; - QString vClass=descriptors[10]; + QString verbalClass=descriptors[NBCOLSINSRLBEFOREFRAME]; + QString vClass=descriptors[NBCOLSINSRLBEFOREFRAME]; +#ifdef DEBUG_LP + LDEBUG << "ConllHandler::extractSemanticInformation verbalClass" << vClass; +#endif LinguisticGraphVertex limaTokenId=cHandler.getLimaTokenId(conllTokenId, sentenceI, limaConllMapping); if (classIndex >= m_verbalClasses.size()) { @@ -298,22 +815,30 @@ bool ConllHandler::extractSemanticInformation(int sentenceI, LimaConllTokenIdMap m_verbalClasses[classIndex]=qMakePair(limaTokenId, vClass); classIndex++; } - for (int roleTargetFieldIndex=0; roleTargetFieldIndex= descriptors.size()) +#ifdef DEBUG_LP + LDEBUG << "ConllHandler::extractSemanticInformation"<<"nb descriptors and roleTargetFieldIndex" << descriptors.size() << roleTargetFieldIndex ; +#endif + if (NBCOLSINSRLBEFOREFRAME+1+roleTargetFieldIndex >= descriptors.size()) { LERROR << "ConllHandler::extractSemanticInformation roleTargetFieldIndex 
error" << roleTargetFieldIndex; break; } - if (descriptors[11+roleTargetFieldIndex]!="-") + if (descriptors[NBCOLSINSRLBEFOREFRAME+1+roleTargetFieldIndex]!="_") { - QString semanticRoleLabel=descriptors[11+roleTargetFieldIndex]; + QString semanticRoleLabel=descriptors[NBCOLSINSRLBEFOREFRAME+1+roleTargetFieldIndex]; LinguisticGraphVertex limaTokenId=cHandler.getLimaTokenId(conllTokenId, sentenceI, limaConllMapping); if(limaTokenId!=0) { - LDEBUG << "ConllHandler::extractSemanticInformation The PoS graph token id matching the conll token id " << conllTokenId << " is " << limaTokenId; +#ifdef DEBUG_LP + LDEBUG << "ConllHandler::extractSemanticInformation argument "<> sRoles; if (roleTargetFieldIndex >= m_semanticRoles.size()) { @@ -328,7 +853,7 @@ bool ConllHandler::extractSemanticInformation(int sentenceI, LimaConllTokenIdMap } } } - return classIndex; + return classIndex != 0; } @@ -365,3 +890,9 @@ LinguisticGraphVertex ConllHandler::getLimaTokenId(int conllTokenId, int sentenc } } // end namespace + + + + + +#endif \ No newline at end of file diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRoleLabelingLoader.h b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRoleLabelingLoader.h index 7ca381c95..d78791565 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRoleLabelingLoader.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/SemanticRoleLabelingLoader.h @@ -18,9 +18,10 @@ */ /** * @file SemanticRoleLabelingLoader.h - * @author Clémence Filmont + * @author Clémence Filmont + * @author Gael de Chalendar * @date 2014-04-17 - * copyright Copyright (C) 2014 by CEA LIST + * copyright Copyright (C) 2014-2016 by CEA LIST */ #ifndef SEMANTICROLELABELINGLOADER_H @@ -38,7 +39,8 @@ class SemanticRoleLabelingLoaderPrivate; /** * @brief A Semantic Role Labeling Loader class - * @author Clémence Filmont + * @author Clémence 
Filmont + * @author Gael de Chalendar */ class SemanticRoleLabelingLoader : public AnalysisLoader { @@ -52,7 +54,7 @@ class SemanticRoleLabelingLoader : public AnalysisLoader LimaStatusCode process(AnalysisContent& analysis) const; private: - SemanticRoleLabelingLoaderPrivate* m_d;; + SemanticRoleLabelingLoaderPrivate* m_d; }; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/s/SemanticRoleLabelingLoader.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/s/SemanticRoleLabelingLoader.cpp deleted file mode 100644 index 28ca49dff..000000000 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SemanticAnalysis/s/SemanticRoleLabelingLoader.cpp +++ /dev/null @@ -1,116 +0,0 @@ -/* -Copyright 2002-2014 CEA LIST - -This file is part of LIMA. - -LIMA is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -LIMA is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. - -You should have received a copy of the GNU Affero General Public License -along with LIMA. 
If not, see -*/ -/************************************************************************ -* -* @file SemanticRoleLabelingLoader.cpp -* @author Clémence Filmont -* @date 2014-- -* copyright Copyright (C) 2014 by CEA LIST -* Project mm_linguisticprocessing -* -* -***********************************************************************/ - -#include "SemanticRoleLabelingLoader.h" -#include "common/AbstractFactoryPattern/SimpleFactory.h" -#include "common/Data/strwstrtools.h" -#include "linguisticProcessing/core/Automaton/recognizerMatch.h" -#include "linguisticProcessing/core/Automaton/recognizerData.h" -#include "common/MediaticData/mediaticData.h" -#include "linguisticProcessing/common/annotationGraph/AnnotationGraph.h" -#include -#include "QStringList" - -using namespace std; -using namespace Lima::LinguisticProcessing::LinguisticAnalysisStructure; -using namespace Lima::LinguisticProcessing::ApplyRecognizer; -using namespace Lima::Common::XMLConfigurationFiles; - - - -namespace Lima { -namespace LinguisticProcessing { -namespace SemanticAnalysis { - -SimpleFactory SemanticRoleLabelingFactory(SEMANTICROLELABELINGLOADER_CLASSID); - - -//*********************************************************************** -SemanticRoleLabelingLoader::SemanticRoleLabelingLoader(): -m_language(0), -m_graph("AnalysisGraph"), -m_suffix(".conll") -{} - -SemanticRoleLabelingLoader::~SemanticRoleLabelingLoader() -{ -} - -//*********************************************************************** - -void SemanticRoleLabelingLoader::init(Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, - Manager* manager){ - - PROCESSORSLOGINIT; - m_language=manager->getInitializationParameters().media; - AnalysisLoader::init(unitConfiguration,manager); - try - { - m_suffix=unitConfiguration.getParamsValueAtKey("outputSuffix"); - } - catch (NoSuchParam& ) {} // keep default value - AnalysisLoader::init(unitConfiguration,manager); - } - - - LimaStatusCode 
SemanticRoleLabelingLoader::process(AnalysisContent& analysis) const{ - QFile file("/home/clemence/textes_test/jamaica_out.conll"); - } - - -SemanticRoleLabelingLoader::ConllHandler::ConllHandler(MediaId language, AnalysisContent& analysis, LinguisticAnalysisStructure::AnalysisGraph* graph): -m_tagIndex(), -m_language(language), -m_analysis(analysis), -m_graph(graph), -m_position(0), -m_length(0), -m_type(), -m_string(), -m_currentElement() -{ - PROCESSORSLOGINIT; - LDEBUG << "SemanticRoleLabelingLoader::ConllHandler constructor"; -} -SemanticRoleLabelingLoader::ConllHandler::~ConllHandler(){} - -// repeated on each line beginning -bool extractSemanticRole(const QString & tokenDescription) -{ - QStringList descriptors; - descriptors=tokenDescription.split(QRegExp("\\t+")); -// cout << descriptors[11]< -*/ -/************************************************************************ -* -* @file SemanticRoleLabelingLoader.h -* @author Clémence Filmont -* @date 2014-04-17 -* copyright Copyright (C) 2014 by CEA LIST -* Project mm_linguisticprocessing -* -* @brief an Semantic Role Labeling Loader class -* -* -***********************************************************************/ - -#ifndef SEMANTICROLELABELINGLOADER_H -#define SEMANTICROLELABELINGLOADER_H - -#include "linguisticProcessing/core/LinguisticProcessors/AnalysisLoader.h" -#include "linguisticProcessing/core/LinguisticAnalysisStructure/AnalysisGraph.h" -#include "linguisticProcessing/core/LinguisticAnalysisStructure/LinguisticGraph.h" -#include -#include -#include - -namespace Lima { -namespace LinguisticProcessing { -namespace SemanticAnalysis { - -#define SEMANTICROLELABELINGLOADER_CLASSID "SemanticRoleLabelingLoader" - -class SemanticRoleLabelingLoader : public AnalysisLoader -{ - public: - SemanticRoleLabelingLoader(); - virtual ~SemanticRoleLabelingLoader(); - - void init(Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, - Manager* manager); - - LimaStatusCode 
process(AnalysisContent& analysis) const; - - private: - MediaId m_language; - std::string m_graph; - std::string m_suffix; -// QXmlSimpleReader* m_parser; /*< XML parser for the loader*/ - - // XML handler - class ConllHandler - { - public: - QMap m_tagIndex; - - ConllHandler(MediaId language, AnalysisContent& analysis, LinguisticAnalysisStructure::AnalysisGraph* graph); - virtual ~ConllHandler(); - - - bool extractSemanticRole(const QString & expectedRole);// repeated on each line beginning - - - - private: - MediaId m_language; - AnalysisContent& m_analysis; - LinguisticAnalysisStructure::AnalysisGraph* m_graph; - uint64_t m_position; - uint64_t m_length; - std::string m_type; - std::string m_string; - std::string m_currentElement; - - }; - -}; - -} // end namespace -} // end namespace -} // end namespace - -#endif \ No newline at end of file diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/NormalizeDateTime.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/NormalizeDateTime.cpp index 954a6eef7..0c3e34628 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/NormalizeDateTime.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/NormalizeDateTime.cpp @@ -264,12 +264,13 @@ updateCurrentDate(AnalysisContent& analysis, unsigned short NormalizeDate::getDayFromString(const LimaString& numdayString) const { SELOGINIT; - // try first conversion of type "premier" -> 1 - unsigned short day = m_resources->getCardinalFromNumberOrdinal(numdayString); + // try to extract number as int from string like 4th, 22nd, 1st or like 17 + unsigned short day = m_resources->getValueFromNumberOrdinal(numdayString); LDEBUG << "NormalizeDate::getDayFromString: testConversion 1 of " << numdayString << "1 day=" << day; + // try first conversion of type "premier" -> 1 // then try conversion of type "10th" -> 10 if( day == NormalizeDateTimeResources::no_day ) { - day = 
m_resources->getDayNumberFromWordOrdinal(numdayString); + day = m_resources->getValueFromWordCardinalOrOrdinal(numdayString); LDEBUG << "NormalizeDate::getDayFromString: testConversion 2 of " << numdayString << "1 day=" << day; } // then try conversion of type "10" -> 10 @@ -450,13 +451,23 @@ operator()(RecognizerMatch& m, // set interval QDate firstDayOfMonth(year,month,1); #ifdef DEBUG_LP - LDEBUG << "NormalizeDate operator(): day=0 and month != 0 => date_begin=" << firstDayOfMonth; +#ifdef ANTINNO_SPECIFIC + // FWI 21/09/2015 modifié temporairement + LDEBUG << "NormalizeDate operator(): day=0 and month != 0 => date_begin=" << "????"; +#else + LDEBUG << "NormalizeDate operator(): day=0 and month != 0 => date_begin=" << firstDayOfMonth; +#endif #endif m.features().setFeature(DATE_BEGIN_FEATURE_NAME,firstDayOfMonth); if (month_end==0) { QDate date_end = firstDayOfMonth.addMonths(1).addDays(-1); #ifdef DEBUG_LP - LDEBUG << "NormalizeDate operator(): day=0 and month != 0 => date_end=" << date_end; +#ifdef ANTINNO_SPECIFIC + // FWI 21/09/2015 modifié temporairement + LDEBUG << "NormalizeDate operator(): day=0 and month != 0 => date_end=" << "????"; +#else + LDEBUG << "NormalizeDate operator(): day=0 and month != 0 => date_end=" << date_end; +#endif #endif m.features().setFeature(DATE_END_FEATURE_NAME,date_end); } @@ -533,7 +544,9 @@ operator()(RecognizerMatch& m, m.features().setFeature(DATESTRING_FEATURE_NAME,m.getString()); } - QString dateSpan = QString::number(year); + QString dateSpan = "XXXX"; + if( year != 0 ) + dateSpan = QString::number(year); #ifdef DEBUG_LP LDEBUG << "NormalizeDate operator(): year: dateSpan=" << dateSpan; #endif diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/NormalizeDateTimeResources.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/NormalizeDateTimeResources.cpp index c4582f499..f53d747af 100644 --- 
a/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/NormalizeDateTimeResources.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/NormalizeDateTimeResources.cpp @@ -29,6 +29,7 @@ #include "linguisticProcessing/client/LinguisticProcessingException.h" #include "common/AbstractFactoryPattern/SimpleFactory.h" #include "common/MediaticData/mediaticData.h" +#include "common/tools/FileUtils.h" #include "common/Data/strwstrtools.h" #include "boost/algorithm/string/split.hpp" #include "boost/algorithm/string/classification.hpp" @@ -56,9 +57,13 @@ MONTHSDAYS_MONTH_ID=std::string("m"); const std::string NormalizeDateTimeResources:: MONTHSDAYS_DAY_ID=std::string("d"); const std::string NormalizeDateTimeResources:: -MONTHSDAYS_ORDINAL_ID=std::string("o"); +WORD_CARDINAL_ID=std::string("c"); const std::string NormalizeDateTimeResources:: -MONTHSDAYS_SUFFIX_ID=std::string("s"); +WORD_CARDINAL_SEPARATOR_ID=std::string("s"); +const std::string NormalizeDateTimeResources:: +WORD_ORDINAL_SUFFIX_ID=std::string("w"); +const std::string NormalizeDateTimeResources:: +NUMBER_ORDINAL_SUFFIX_ID=std::string("n"); NormalizeDateTimeResources::NormalizeDateTimeResources(): @@ -89,7 +94,7 @@ init(GroupConfigurationStructure& unitConfiguration, try { tzDbFile = unitConfiguration.getParamsValueAtKey("timezoneDatabase"); - tzDbFile = resourcesPath + "/" + tzDbFile; + tzDbFile = Common::Misc::findFileInPaths(resourcesPath.c_str(), tzDbFile.c_str()).toUtf8().constData(); // m_timezoneDatabase = new boost::local_time::tz_database(); // m_timezoneDatabase->load_from_file(tzDbFile); } @@ -112,7 +117,7 @@ init(GroupConfigurationStructure& unitConfiguration, try { string monthsDaysFile = unitConfiguration.getParamsValueAtKey("monthsDays"); - monthsDaysFile = resourcesPath + "/" + monthsDaysFile; + monthsDaysFile = Common::Misc::findFileInPaths(resourcesPath.c_str(), monthsDaysFile.c_str()).toUtf8().constData(); if (!readMonthDays(monthsDaysFile)) { 
SELOGINIT; LERROR << "Error loading monthsDays resources '" @@ -132,6 +137,7 @@ bool NormalizeDateTimeResources:: readMonthDays(const std::string& monthsDaysFile) { + m_wordCardinalSeparator[Common::Misc::utf8stdstring2limastring(" ")]=0; ifstream file(monthsDaysFile.c_str(), std::ifstream::binary); if (!file.good()) { return false; @@ -139,13 +145,13 @@ readMonthDays(const std::string& monthsDaysFile) string utf8line; LimaString line; while (file.good()) { - getline(file,utf8line); + utf8line = Lima::Common::Misc::readLine(file); if (!utf8line.empty()) { line=Common::Misc::utf8stdstring2limastring(utf8line); std::vector elements; split(elements,utf8line,is_any_of(MONTHSDAYS_MAIN_SEP)); - // three elements in line: (month|day|ordinal|suffix) num list,of,strings - if (elements.size()!=3) { + // three elements in line: (month|day|ordinal|cardinal|suffix) num list-of-strings + if (elements.size()!=3) { SELOGINIT; LWARN << "MonthsDaysResources: cannot parse line " << utf8line; continue; @@ -153,12 +159,14 @@ readMonthDays(const std::string& monthsDaysFile) map* names(0); if (elements[0] == MONTHSDAYS_MONTH_ID) { names=&m_months; } else if (elements[0] == MONTHSDAYS_DAY_ID) { names=&m_days; } - else if (elements[0] == MONTHSDAYS_ORDINAL_ID) { names=&m_ordinal; } - else if (elements[0] == MONTHSDAYS_SUFFIX_ID) { names=&m_ordinalSuffixes; } + else if (elements[0] == WORD_CARDINAL_SEPARATOR_ID) { names=&m_wordCardinalSeparator; } + else if (elements[0] == WORD_CARDINAL_ID) { names=&m_wordCardinal; } + else if (elements[0] == WORD_ORDINAL_SUFFIX_ID) { names=&m_wordOrdinalSuffixes; } + else if (elements[0] == NUMBER_ORDINAL_SUFFIX_ID) { names=&m_numberOrdinalSuffixes; } else { SELOGINIT; LWARN << "MonthsDaysResources: cannot parse line " << utf8line - << ": first element must be 'm' 'd', 'o' or 's'"; + << ": first element must be 'm' 'd', 'c', 'w', 'n' or 's'"; continue; } @@ -208,29 +216,87 @@ getDayNumber(const LimaString& dayName) const } unsigned short 
NormalizeDateTimeResources:: -getDayNumberFromWordOrdinal(const LimaString& dayName) const +getValueFromWordCardinalOrOrdinal(const LimaString& dayName) const { - map::const_iterator - it=m_ordinal.find(dayName); - if (it==m_ordinal.end()) { - return NormalizeDateTimeResources::no_day; + SELOGINIT; + unsigned short day(0); + // trim suffix first, second or th, or (ème, ième, ieme, eme) + LimaString numberAsString(dayName); + LDEBUG << "NormalizeDateTimeResources::getValueFromWordCardinalOrOrdinal() numberAsString=" + << numberAsString; + map::const_iterator suffixIt=m_wordOrdinalSuffixes.begin(); + for( ; suffixIt!=m_wordOrdinalSuffixes.end() ; suffixIt++ ) + { + const LimaString& suffix = (*suffixIt).first; + int index = dayName.indexOf(suffix, 0, Qt::CaseInsensitive); + if (index >= 0) { + numberAsString = LimaString(dayName.constData(),index); + day += (*suffixIt).second; + break; + } } - return (*it).second; + LDEBUG << "NormalizeDateTimeResources::getValueFromWordCardinalOrOrdinal: after trim numberAsString=" + << numberAsString << ", day=" << day; + if( numberAsString.isEmpty() ) + return day; + // compute value from left to right + int parsingPosition(0); + LDEBUG << "NormalizeDateTimeResources::getValueFromWordCardinalOrOrdinal: parsingPosition=" << parsingPosition; + for( ; ; ) + { + int index(-1); + // identify component of number + map::const_iterator cardinalIt=m_wordCardinal.begin(); + for( ; cardinalIt!=m_wordCardinal.end() ; cardinalIt++ ) + { + const LimaString& word = (*cardinalIt).first; + int index = numberAsString.indexOf(word, parsingPosition, Qt::CaseInsensitive); + if (index >= 0) { + day += (*cardinalIt).second; + parsingPosition += word.length(); + LDEBUG << "NormalizeDateTimeResources::getValueFromWordCardinalOrOrdinal: found" + << word << ", day=" << day << ", parsingPosition=" << parsingPosition; + break; + } + } + // skip separator + int skipIndex(-1); + do + { + map::const_iterator separatorIt=m_wordCardinalSeparator.begin(); + for( ; 
separatorIt!=m_wordCardinalSeparator.end() ; separatorIt++ ) + { + const LimaString& separator = (*separatorIt).first; + int skipIndex = numberAsString.indexOf(separator, parsingPosition, Qt::CaseInsensitive); + if (skipIndex == 0) { + parsingPosition += separator.length(); + LDEBUG << "NormalizeDateTimeResources::getValueFromWordCardinalOrOrdinal: found" + << separator << ", day=" << day << ", parsingPosition=" << parsingPosition; + break; + } + } + } while( skipIndex == 0 ); + if( index == -1 ) + break; + } + return day; } unsigned short NormalizeDateTimeResources:: -getCardinalFromNumberOrdinal(const LimaString& dayName) const +getValueFromNumberOrdinal(const LimaString& dayName) const { - // try to extract number as int from string - map::const_iterator it=m_ordinalSuffixes.begin(); - for( ; it!=m_ordinalSuffixes.end() ; it++ ) + // try to extract number as int from string like 4th, 22nd, 1st or 17 + map::const_iterator it=m_numberOrdinalSuffixes.begin(); + for( ; it!=m_numberOrdinalSuffixes.end() ; it++ ) { + // try to trim suffix th, nd, st or rd const LimaString& suffix = (*it).first; int index = dayName.indexOf(suffix, 0, Qt::CaseInsensitive); - if (index < 0) - continue; - LimaString numberAsString(dayName.constData(),index); + LimaString numberAsString(dayName); + if (index > 0) + numberAsString = LimaString(dayName.constData(),index); bool ok(false); + // try to convert trimmed string to int unsigned short day = numberAsString.toUShort(&ok); if( ok) return day; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/NormalizeDateTimeResources.h b/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/NormalizeDateTimeResources.h index bd66c209e..6ffd9bb6a 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/NormalizeDateTimeResources.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/NormalizeDateTimeResources.h @@ -59,8 +59,8 @@ class 
LIMA_SPECIFICENTITIES_EXPORT NormalizeDateTimeResources : public AbstractR // const boost::local_time::tz_database& getTimezoneDatabase() const; unsigned short getMonthNumber(const LimaString& monthName) const; unsigned short getDayNumber(const LimaString& dayName) const; - unsigned short getCardinalFromNumberOrdinal(const LimaString& dayName) const; - unsigned short getDayNumberFromWordOrdinal(const LimaString& dayName) const; + unsigned short getValueFromWordCardinalOrOrdinal(const LimaString& dayName) const; + unsigned short getValueFromNumberOrdinal(const LimaString& dayName) const; static const unsigned short no_month=static_cast(-1); static const unsigned short no_day=static_cast(-1); @@ -70,8 +70,10 @@ class LIMA_SPECIFICENTITIES_EXPORT NormalizeDateTimeResources : public AbstractR // boost::local_time::tz_database* m_timezoneDatabase; std::map m_months; std::map m_days; - std::map m_ordinal; - std::map m_ordinalSuffixes; + std::map m_wordCardinal; + std::map m_wordCardinalSeparator; + std::map m_wordOrdinalSuffixes; + std::map m_numberOrdinalSuffixes; // private member functions bool readMonthDays(const std::string& monthsDaysFile); @@ -81,8 +83,10 @@ class LIMA_SPECIFICENTITIES_EXPORT NormalizeDateTimeResources : public AbstractR static const std::string MONTHSDAYS_NAMELIST_SEP; static const std::string MONTHSDAYS_MONTH_ID; static const std::string MONTHSDAYS_DAY_ID; - static const std::string MONTHSDAYS_ORDINAL_ID; - static const std::string MONTHSDAYS_SUFFIX_ID; + static const std::string WORD_CARDINAL_ID; + static const std::string WORD_CARDINAL_SEPARATOR_ID; + static const std::string WORD_ORDINAL_SUFFIX_ID; + static const std::string NUMBER_ORDINAL_SUFFIX_ID; }; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesConstraints.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesConstraints.cpp index f60586b98..29ee9f736 100644 --- 
a/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesConstraints.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesConstraints.cpp @@ -55,11 +55,11 @@ namespace SpecificEntities { // factories for constraint functions defined in this file -ConstraintFunctionFactory -isASpecificEntityFactory(isASpecificEntityId); +ConstraintFunctionFactory + isAlphaPossessiveFactory(isAlphaPossessiveId); -ConstraintFunctionFactory - isInSameSpecificEntityFactory(isInSameSpecificEntityId); +ConstraintFunctionFactory + isASpecificEntityFactory(isASpecificEntityId); ConstraintFunctionFactory CreateSpecificEntityFactory(CreateSpecificEntityId); @@ -67,6 +67,9 @@ ConstraintFunctionFactory ConstraintFunctionFactory SetEntityFeatureFactory(SetEntityFeatureId); +ConstraintFunctionFactory + AddEntityFeatureAsEntityFactory(AddEntityFeatureAsEntityId); + ConstraintFunctionFactory AddEntityFeatureFactory(AddEntityFeatureId); @@ -80,6 +83,25 @@ ConstraintFunctionFactory NormalizeEntityFactory(NormalizeEntityId); +isAlphaPossessive:: +isAlphaPossessive(MediaId language, + const LimaString& complement): +ConstraintFunction(language,complement) +{ +} + +bool isAlphaPossessive::operator()(const LinguisticAnalysisStructure::AnalysisGraph& graph, + const LinguisticGraphVertex& v, + AnalysisContent& /*analysis*/) const +{ + LinguisticGraph* lingGraph = const_cast(graph.getGraph()); +// Token* token=get(vertex_token,*(graph.getGraph()),v); + VertexTokenPropertyMap tokenMap = get(vertex_token, *lingGraph); + const TStatus& status = tokenMap[v]->status(); + return( status.isAlphaPossessive() ); +} + + isASpecificEntity:: isASpecificEntity(MediaId language, const LimaString& complement): @@ -142,103 +164,6 @@ bool isASpecificEntity::operator()(const LinguisticAnalysisStructure::AnalysisGr return false; } -isInSameSpecificEntity:: - isInSameSpecificEntity(MediaId language, - const LimaString& complement): - 
ConstraintFunction(language,complement), - m_type() -{ - if (! complement.isEmpty()) { - m_type=Common::MediaticData::MediaticData::single().getEntityType(complement); - } -} - -/** @brief Tests if the two given vertices are in the same specific entity - * - * There is several cases: - * - va1 and va2 are SE vertices : true iff va1 == va2 - * - va1 and va2 are standard vertices : true iff there is an outgoing edge - * in the annotation graph annotated with "belongstose" from each of them - * and toward the same vertex - * - va1 (va2) is a SE vertex and there is an outgoing edge in the annotation - * graph annotated with "belongstose" from va2 (va1) to va1 (va2). - * - * In all the cases, va1 and va2 are the uniq "morphannot" matches of v1 and v2 - * - * @note This method handles only the first level of SE: if a SE is recursively - * included in a second one, morph vertices from the first one and from the - * the second one (not in the first one) will NOT be considered as being in the - * same specific entity. - * @note It is considered that a morph vertex can be directly in only one SE. - * So, its annotation vertex will have at most one "belongstose" annotated - * outgoing edge. 
- */ -bool isInSameSpecificEntity::operator()( - const LinguisticAnalysisStructure::AnalysisGraph& /*graph*/, - const LinguisticGraphVertex& v1, - const LinguisticGraphVertex& v2, - AnalysisContent& analysis) const -{ - RecognizerData* recoData=static_cast(analysis.getData("RecognizerData")); - AnnotationData* annotationData = static_cast< AnnotationData* >(analysis.getData("AnnotationData")); - AnnotationGraphVertex va1 = *(annotationData->matches(recoData->getGraphId(), v1, "annot").begin()); - AnnotationGraphVertex va2 = *(annotationData->matches(recoData->getGraphId(), v2, "annot").begin()); - - if ( (va1 == va2) && annotationData->hasAnnotation(va1, Common::Misc::utf8stdstring2limastring("SpecificEntity")) ) - { // first case - return true; - } - AnnotationGraphVertex vase = std::numeric_limits::max(); - AnnotationGraphVertex va = std::numeric_limits::max(); - if (annotationData->hasAnnotation(va1, Common::Misc::utf8stdstring2limastring("SpecificEntity"))) - { - vase = va1; - va = va2; - } - else if (annotationData->hasAnnotation(va2, Common::Misc::utf8stdstring2limastring("SpecificEntity"))) - { - vase = va2; - va = va1; - } - if (vase == std::numeric_limits::max()) - { // second case - AnnotationGraphOutEdgeIt it1, it1_end; - AnnotationGraphVertex se1 = std::numeric_limits::max(); - boost::tie(it1, it1_end) = out_edges(va1, annotationData->getGraph()); - for (; it1 != it1_end; it1++) - { - if ( annotationData->intAnnotation((*it1), Common::Misc::utf8stdstring2limastring("belongstose"))==1) - { - se1 = target(*it1, annotationData->getGraph()); - break; - } - } - if (se1 == std::numeric_limits::max()) - { - return false; - } - AnnotationGraphVertex se2 = std::numeric_limits::max(); - AnnotationGraphOutEdgeIt it2, it2_end; - boost::tie(it2, it2_end) = out_edges(va2, annotationData->getGraph()); - for (; it2 != it2_end; it2++) - { - if ( annotationData->intAnnotation((*it2), Common::Misc::utf8stdstring2limastring("belongstose"))==1) - { - se2 = target(*it2, 
annotationData->getGraph()); - break; - } - } - return (se1 == se2); - } - else - { // third case - bool ok; AnnotationGraphEdge e; - boost::tie(e, ok) = edge(va,vase,annotationData->getGraph()); - return (ok && (annotationData->intAnnotation(e, Common::Misc::utf8stdstring2limastring("belongstose"))==1)); - } -} - - CreateSpecificEntity::CreateSpecificEntity(MediaId language, const LimaString& complement): @@ -423,7 +348,18 @@ bool CreateSpecificEntity::operator()(Automaton::RecognizerMatch& match, annotationData->dumpFunction("SpecificEntity", new DumpSpecificEntityAnnotation()); } - RecognizerData* recoData=static_cast(analysis.getData("RecognizerData")); + AnalysisData* rdata=analysis.getData("RecognizerData"); + if (rdata==0) { + SELOGINIT; + LERROR << "CreateSpecificEntity: missing data RecognizerData: entity will not be created"; + return false; + } + RecognizerData* recoData=static_cast(rdata); + if (recoData==0) { + SELOGINIT; + LERROR << "CreateSpecificEntity: missing data RecognizerData: entity will not be created"; + return false; + } std::string graphId=recoData->getGraphId(); // LDEBUG << " match is " << match; @@ -443,7 +379,14 @@ bool CreateSpecificEntity::operator()(Automaton::RecognizerMatch& match, VertexTokenPropertyMap tokenMap = get(vertex_token, *lingGraph); VertexDataPropertyMap dataMap = get(vertex_data, *lingGraph); - const MorphoSyntacticData* dataHead = dataMap[annot.getHead()]; + LinguisticGraphVertex head = annot.getHead(); + if( head == 0 ) { + // take status of last element in match for eng + head = v2; + // or take status of first element in match (in fre?) + // head = v1; + } + const MorphoSyntacticData* dataHead = dataMap[head]; // Preparer le Token et le MorphoSyntacticData pour le nouveau noeud. Construits // a partir des infos de l'entitee nommee @@ -464,6 +407,9 @@ bool CreateSpecificEntity::operator()(Automaton::RecognizerMatch& match, elem.type = SPECIFIC_ENTITY; // MorphoSyntacticType if (! 
m_microsToKeep.empty()) { +#ifdef DEBUG_LP + LDEBUG << "CreateSpecificEntity, use micros from the rule "; +#endif // micros are given in the rules addMicrosToMorphoSyntacticData(newMorphData,dataHead,m_microsToKeep,elem); } @@ -472,18 +418,19 @@ bool CreateSpecificEntity::operator()(Automaton::RecognizerMatch& match, LDEBUG << "CreateSpecificEntity, use micros from config file "; #endif // use micros given in the config file : get the specific resource - // (specific to modex) + // (specific to modex) AddEntityFeature // WARN : some hard coded stuff here in resource names EntityType seType=match.getType(); if (seType.getGroupId() == 0) { SELOGINIT; LERROR << "CreateSpecificEntity::operator() null group id:" << seType; + delete newMorphData; return false; } - std::string resourceName= - Common::Misc::limastring2utf8stdstring(Common::MediaticData::MediaticData::single().getEntityGroupName(seType.getGroupId()))+"Micros"; - AbstractResource* res=LinguisticResources::single().getResource(m_language,resourceName); + const LimaString& resourceName = + Common::MediaticData::MediaticData::single().getEntityGroupName(seType.getGroupId())+"Micros"; + AbstractResource* res=LinguisticResources::single().getResource(m_language,resourceName.toUtf8().constData()); #ifdef DEBUG_LP LDEBUG << "Entities resource name is : " << resourceName; #endif @@ -506,6 +453,8 @@ bool CreateSpecificEntity::operator()(Automaton::RecognizerMatch& match, // cannot find micros for this type: error SELOGINIT; LERROR << "CreateSpecificEntity: missing resource " << resourceName ; + delete newMorphData; + return false; } } @@ -516,12 +465,23 @@ bool CreateSpecificEntity::operator()(Automaton::RecognizerMatch& match, match.positionBegin(), match.length()); - // always take status from first element in match - //if (match.size() == 1) - //{ - newToken->setStatus(tokenMap[v1]->status()); - //} + // take posessive tstatus from head + TStatus tStatus = tokenMap[head]->status(); + const TStatus& headTStatus = 
tokenMap[v2]->status(); + if(headTStatus.isAlphaPossessive()) { + tStatus.setAlphaPossessive(true); + } + newToken->setStatus(tStatus); + if (newMorphData->empty()) + { + SELOGINIT; + LERROR << "CreateSpecificEntity::operator() Found no morphosyntactic data for new vertex. Abort."; + delete newToken; + delete newMorphData; + assert(false); + return false; + } // LDEBUG << " Updating morphologic graph "<< graphId; // creer le noeud et ses 2 arcs LinguisticGraphVertex newVertex; @@ -562,8 +522,14 @@ bool CreateSpecificEntity::operator()(Automaton::RecognizerMatch& match, } else { - AnnotationGraphVertex src = *(matches.begin()); - annotationData->annotate( src, agv, Common::Misc::utf8stdstring2limastring("belongstose"), 1); + if( recoData->hasVertexAsEmbededEntity((*matchIt).m_elem.first) ) + { +#ifdef DEBUG_LP + LDEBUG << "CreateSpecificEntity::operator(): vertex " << *(matches.begin()) << " is embeded"; +#endif + AnnotationGraphVertex src = *(matches.begin()); + annotationData->annotate( agv, src, Common::Misc::utf8stdstring2limastring("holds"), 1); + } } } @@ -887,9 +853,11 @@ operator()(const LinguisticAnalysisStructure::AnalysisGraph& graph, const LinguisticGraphVertex& vertex, AnalysisContent& analysis) const { +#ifdef DEBUG_LP SELOGINIT; LDEBUG << "SetEntityFeature:: (one argument) start... 
"; LDEBUG << "SetEntityFeature::(feature:" << m_featureName << ", vertex:" << vertex << ")"; +#endif // get RecognizerData: the data in which the features are stored RecognizerData* recoData=static_cast(analysis.getData("RecognizerData")); if (recoData==0) { @@ -909,7 +877,9 @@ operator()(const LinguisticAnalysisStructure::AnalysisGraph& graph, } switch (m_featureType) { case QVariant::String: +#ifdef DEBUG_LP LDEBUG << "SetEntityFeature:: recoData->setEntityFeature(feature:" << m_featureName << ", featureValue:" << featureValue<< ")"; +#endif recoData->setEntityFeature(m_featureName,featureValue); break; case QVariant::Int: @@ -942,15 +912,18 @@ operator()(const LinguisticAnalysisStructure::AnalysisGraph& graph, const LinguisticGraphVertex& v2, AnalysisContent& analysis) const { +#ifdef DEBUG_LP SELOGINIT; // LERROR << "SetEntityFeature:: Error: version with two vertices parameters is not implemented"; // return false; LDEBUG << "SetEntityFeature:: (two arguments) start... "; LDEBUG << "SetEntityFeature::(feature:" << m_featureName << ", v1:" << v1 << ", v2:" << v2 << ")"; +#endif // get RecognizerData: the data in which the features are stored RecognizerData* recoData=static_cast(analysis.getData("RecognizerData")); if (recoData==0) { + SELOGINIT; LERROR << "SetEntityFeature:: Error: missing RecognizerData"; return false; } @@ -985,8 +958,10 @@ operator()(const LinguisticAnalysisStructure::AnalysisGraph& graph, } } } - if( nbEdges > 1 ) + if( nbEdges > 1 ) { + SELOGINIT; LWARN << "SetEntityFeature:: Warning: ambiguïties in graph"; + } Token* token=get(vertex_token,lGraph,v); if (v == v1) { @@ -1025,6 +1000,54 @@ operator()(const LinguisticAnalysisStructure::AnalysisGraph& graph, return true; } +//---------------------------------------------------------------------------------------- +// AddEntityFeatureAsEntity : assert the the vertex is a named entity. 
+// Add it to the list of components as an embeded entity (the list is used to create the link +// "holds" between the annotation of the embeded and the embedding entity. +// Remember the embedding entity is no yet created. + +AddEntityFeatureAsEntity::AddEntityFeatureAsEntity(MediaId language, + const LimaString& complement): +ConstraintFunction(language,complement), +m_featureName(""), +m_featureType(QVariant::UserType) +{ + if (complement.size()) { + QStringList complementElements = complement.split(":"); + m_featureName=complementElements.front().toUtf8().constData(); + complementElements.pop_front(); + if (!complementElements.empty()) { +#ifdef DEBUG_LP + SELOGINIT; + LERROR << "AddEntityFeatureAsEntity::AddEntityFeatureAsEntity(): no type specification authorized for the feature (" + << complementElements << ") the feature type is the type of the entity"; +#endif + } + } +} + +bool AddEntityFeatureAsEntity:: +operator()(const LinguisticAnalysisStructure::AnalysisGraph& /* unused graph */, + const LinguisticGraphVertex& vertex, + AnalysisContent& analysis) const +{ +#ifdef DEBUG_LP + SELOGINIT; + LDEBUG << "AddEntityFeatureAsEntity:: (one argument) start... 
"; + LDEBUG << "AddEntityFeatureAsEntity::(feature:" << m_featureName << ", vertex:" << vertex << ")"; +#endif + // get RecognizerData: the data in which the features are stored + RecognizerData* recoData=static_cast(analysis.getData("RecognizerData")); + if (recoData==0) { + SELOGINIT; + LERROR << "AddEntityFeatureAsEntity:: Error: missing RecognizerData"; + return false; + } + // add the vertex to the list of embeded named entities + recoData->addVertexAsEmbededEntity(vertex); + return true; +} + //---------------------------------------------------------------------------------------- // AddEntityFeature : add a value for a given feature to the recognized entity // we do not have direct access to the RecognizerMatch of the entity when calling this function @@ -1404,8 +1427,20 @@ SELOGINIT; LERROR << "NormalizeEntity:: Error: missing RecognizerData"; return false; } -// assign stored features to RecognizerMatch features -match.features()=recoData->getEntityFeatures(); +// assign stored features to RecognizerMatch features (preserving DEFAULT_ATTIBUTE) +//match.features()=recoData->getEntityFeatures(); +#ifdef ANTINNO_SPECIFIC +Q_FOREACH (const auto& f, recoData->getEntityFeatures()) { +#else +for (const auto& f: recoData->getEntityFeatures()) { +#endif + match.features().addFeature(f.getName(),f.getValue()); + EntityFeatures::iterator featureIt = match.features().findLast(f.getName()); + if( f.getPosition() != UNDEFPOSITION ) { + (*featureIt).setPosition(f.getPosition()); + (*featureIt).setLength(f.getLength()); + } +} // must clear the stored features, once they are used (otherwise, will be kept for next entity) recoData->clearEntityFeatures(); return true; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesConstraints.h b/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesConstraints.h index 31e8c26a0..a4a4dec8a 100644 --- 
a/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesConstraints.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesConstraints.h @@ -33,10 +33,11 @@ namespace LinguisticProcessing { namespace SpecificEntities { // ids for constraints in this file +#define isAlphaPossessiveId "isAlphaPossessive" #define isASpecificEntityId "isASpecificEntity" -#define isInSameSpecificEntityId "isInSameSpecificEntity" #define CreateSpecificEntityId "CreateSpecificEntity" #define SetEntityFeatureId "SetEntityFeature" +#define AddEntityFeatureAsEntityId "AddEntityFeatureAsEntity" #define AddEntityFeatureId "AddEntityFeature" #define AppendEntityFeatureId "AppendEntityFeature" #define ClearEntityFeaturesId "ClearEntityFeatures" @@ -45,44 +46,27 @@ namespace SpecificEntities { /** @author Benoit Mathieu */ -class LIMA_SPECIFICENTITIES_EXPORT isASpecificEntity : public Automaton::ConstraintFunction +class LIMA_SPECIFICENTITIES_EXPORT isAlphaPossessive : public Automaton::ConstraintFunction { public: - isASpecificEntity(MediaId language, + isAlphaPossessive(MediaId language, const LimaString& complement=LimaString()); - ~isASpecificEntity() {} + ~isAlphaPossessive() {} bool operator()(const LinguisticAnalysisStructure::AnalysisGraph& graph, const LinguisticGraphVertex& v, AnalysisContent& analysis) const; - -private: - Common::MediaticData::EntityType m_type; }; -class LIMA_SPECIFICENTITIES_EXPORT isInSameSpecificEntity : public Automaton::ConstraintFunction +class LIMA_SPECIFICENTITIES_EXPORT isASpecificEntity : public Automaton::ConstraintFunction { public: - isInSameSpecificEntity(MediaId language, - const LimaString& complement=LimaString()); - ~isInSameSpecificEntity() {} - - /** @brief Tests if the two given vertices are in the same specific entity - * - * There is several cases: - * - va1 and va2 are SE vertices : true iff va1 == va2 - * - va1 and va2 are standard vertices : true iff there is an 
outgoing edge in - * the annotation graph annotated with "belongstose" from each of them and - * toward the same vertex - * - va1 (va2) is a SE vertex and there is an outgoing edge in the annotation - * graph annotated with "belongstose" from va2 (va1) to va1 (va2). - * - * In all the cases, va1 and va2 are the uniq "morphannot" matches of v1 and v2 - */ + isASpecificEntity(MediaId language, + const LimaString& complement=LimaString()); + ~isASpecificEntity() {} bool operator()(const LinguisticAnalysisStructure::AnalysisGraph& graph, - const LinguisticGraphVertex& v1, - const LinguisticGraphVertex& v2, + const LinguisticGraphVertex& v, AnalysisContent& analysis) const; - + private: Common::MediaticData::EntityType m_type; }; @@ -166,6 +150,27 @@ class LIMA_SPECIFICENTITIES_EXPORT SetEntityFeature : public Automaton::Constrai QVariant::Type m_featureType; }; +/** + * @brief This action add a vertex as an embeded entity + * of the entity (i.e. during the rule matching process). + * + */ +class LIMA_SPECIFICENTITIES_EXPORT AddEntityFeatureAsEntity : public Automaton::ConstraintFunction +{ +public: + AddEntityFeatureAsEntity(MediaId language, + const LimaString& complement=LimaString()); + ~AddEntityFeatureAsEntity() {} + bool operator()(const LinguisticAnalysisStructure::AnalysisGraph& graph, + const LinguisticGraphVertex& vertex, + AnalysisContent& analysis) const; + +private: + std::string m_featureName; + Common::MediaticData::EntityType m_type; + QVariant::Type m_featureType; +}; + /** * @brief This action set the value of a feature for an entity during the recognition * of the entity (i.e. during the rule matching process). 
diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesLoader.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesLoader.cpp index 29bd807d5..ee00ad454 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesLoader.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesLoader.cpp @@ -28,6 +28,7 @@ #include "SpecificEntitiesLoader.h" #include "SpecificEntitiesConstraints.h" #include "common/AbstractFactoryPattern/SimpleFactory.h" +#include "common/tools/FileUtils.h" #include "common/Data/strwstrtools.h" #include "linguisticProcessing/core/Automaton/recognizerMatch.h" #include "linguisticProcessing/core/Automaton/recognizerData.h" @@ -77,8 +78,8 @@ init(Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfigurati deque modex=unitConfiguration.getListsValueAtKey("modex"); for (deque::const_iterator it=modex.begin(),it_end=modex.end();it!=it_end;it++) { LDEBUG << "loader: initialize modex " << *it; - string filename=Common::MediaticData::MediaticData::single().getConfigPath()+"/"+*it; - Common::XMLConfigurationFiles::XMLConfigurationFileParser parser(filename); + QString filename = Common::Misc::findFileInPaths(Common::MediaticData::MediaticData::single().getConfigPath().c_str(),(*it).c_str()); + Common::XMLConfigurationFiles::XMLConfigurationFileParser parser(filename.toUtf8().constData()); Common::MediaticData::MediaticData::changeable().initEntityTypes(parser); } } @@ -114,7 +115,11 @@ process(AnalysisContent& analysis) const SpecificEntitiesLoader::XMLHandler handler(m_language,analysis,graph); m_parser->setContentHandler(&handler); m_parser->setErrorHandler(&handler); +#ifdef ANTINNO_SPECIFIC QFile file(getInputFile(analysis).c_str()); +#else + QFile file(getInputFile(analysis)); +#endif if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) throw XMLException(); if 
(!m_parser->parse( QXmlInputSource(&file))) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesMicros.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesMicros.cpp index 1f5c31588..903013dc8 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesMicros.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesMicros.cpp @@ -64,23 +64,30 @@ init(GroupConfigurationStructure& unitConfiguration, MediaId language=manager->getInitializationParameters().language; const PropertyManager& microManager = static_cast(MediaticData::single().mediaData(language)).getPropertyCodeManager().getPropertyManager("MICRO"); - const map >& entities= - unitConfiguration.getLists(); + const map >& entities = unitConfiguration.getLists(); + #ifdef DEBUG_LP + LDEBUG << "entities.size() " << entities.size(); + #endif - for (map >::const_iterator it=entities.begin(), - it_end=entities.end(); it!=it_end; it++) { + for (auto it=entities.begin(), it_end=entities.end(); it!=it_end; it++) { LimaString entityName=Common::Misc::utf8stdstring2limastring((*it).first); #ifdef DEBUG_LP LDEBUG << "Adding categories to entity " << entityName; #endif try { EntityType type=static_cast(MediaticData::single()).getEntityType(entityName); - for (deque::const_iterator micro=(*it).second.begin(), - micro_end=(*it).second.end(); micro!=micro_end; micro++) { + for (auto micro=(*it).second.begin(), micro_end=(*it).second.end(); micro!=micro_end; micro++) { + LinguisticCode code = microManager.getPropertyValue(*micro); + if (code == 0) { + SELOGINIT; + LERROR << "SpecificEntitiesMicros::init on entity" << entityName << "," << *micro << "linguistic code is not defined for language" << MediaticData::single().getMediaId(language); + } + else { #ifdef DEBUG_LP - LDEBUG << "Adding " << *micro << microManager.getPropertyValue(*micro) << " to 
EntityType " << type; + LDEBUG << "Adding " << *micro << code << " to EntityType " << type; #endif - m_micros[type].insert(microManager.getPropertyValue(*micro)); + m_micros[type].insert(code); + } } } catch (LimaException& e) { diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesRecognizer.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesRecognizer.cpp index aebad4304..19901421e 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesRecognizer.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesRecognizer.cpp @@ -183,7 +183,6 @@ LimaStatusCode SpecificEntitiesRecognizer::process( LinguisticGraph* graph=anagraph->getGraph(); std::queue toVisit; VertexTokenPropertyMap tokenMap=get(vertex_token,*graph); - VertexDataPropertyMap DataMap=get(vertex_data,*graph); std::set visited; try diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesXmlLogger.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesXmlLogger.cpp index 36ed408d0..2bbbd1162 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesXmlLogger.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesXmlLogger.cpp @@ -215,7 +215,7 @@ LimaStatusCode SpecificEntitiesXmlLogger::process( } const SpecificEntityAnnotation* annot=getSpecificEntityAnnotation(v,annotationData); if (annot != 0) { - outputEntity(out,v,annot,tokenMap,offset); + outputEntity(annotationData,out,v,annot,tokenMap,offset); } } } @@ -251,7 +251,7 @@ LimaStatusCode SpecificEntitiesXmlLogger::process( continue; } v = annotationData->intAnnotation(*itv,Common::Misc::utf8stdstring2limastring(m_graph)); - outputEntity(out,v,annot,tokenMap,offset); + 
outputEntity(annotationData,out,v,annot,tokenMap,offset); } } } @@ -270,18 +270,20 @@ LimaStatusCode SpecificEntitiesXmlLogger::process( } void SpecificEntitiesXmlLogger:: -outputEntity(std::ostream& out, - LinguisticGraphVertex v, - const SpecificEntityAnnotation* annot, - const VertexTokenPropertyMap& tokenMap, - uint64_t offset) const +outputEntity( AnnotationData* annotationData, + std::ostream& out, + LinguisticGraphVertex v, + const SpecificEntityAnnotation* annot, + const VertexTokenPropertyMap& tokenMap, + uint64_t offset) const { LinguisticAnalysisStructure::Token* vToken = tokenMap[v]; // LDEBUG << "SpecificEntitiesXmlLogger tokenMap[" << v << "] = " << vToken; if (vToken == 0) { SELOGINIT; - LERROR << "Vertex " << v << " has no entry in the analysis graph token map. This should not happen !!"; + LERROR << "SpecificEntitiesXmlLogger::outputEntity: Vertex " << v + << " has no entry in the analysis graph token map. This should not happen !!"; } else { @@ -307,13 +309,61 @@ outputEntity(std::ostream& out, featureItr!=features_end; featureItr++) { if( featureItr->getPosition() != UNDEFPOSITION ) { - out << "<" << featureItr->getName(); + out << "<" << featureItr->getName(); out << " pos=\"" << featureItr->getPosition() << "\""; out << " len=\"" << featureItr->getLength() << "\""; - out << ">"; - out << Common::Misc::limastring2utf8stdstring(Common::Misc::transcodeToXmlEntities(Common::Misc::utf8stdstring2limastring(featureItr->getValueString()))) - << "getName() << ">"; - } + out << ">"; + out << Common::Misc::limastring2utf8stdstring(Common::Misc::transcodeToXmlEntities(Common::Misc::utf8stdstring2limastring(featureItr->getValueString()))) + << "getName() << ">"; + } + } + + // TODO: Follow "belongstose" links to outputs embeded entities as components + // Get the current annotationVertex (is there any more simple solution???) 
+ std::set< AnnotationGraphVertex > matches = annotationData->matches(m_graph,v,"annot"); + AnnotationGraphVertex va1; + std::set< AnnotationGraphVertex >::const_iterator it = matches.begin(); + for( ; it != matches.end(); it++) + { + va1=*it; + SELOGINIT; + LDEBUG << "SpecificEntitiesXmlLogger::outputEntity: get agv = " << va1; + if (annotationData->hasAnnotation(va1, Common::Misc::utf8stdstring2limastring("SpecificEntity"))) + break; + } + if( it == matches.end() ) + { + SELOGINIT; + LERROR << "SpecificEntitiesXmlLogger::outputEntity: could not find annotation of node " << v << "in LinguisticGraph"; + } + else + { + SELOGINIT; + LDEBUG << "SpecificEntitiesXmlLogger::outputEntity: agv " << va1 << " is a SpecificEntity Annotation"; + // Follow "belongstose" out_edges to get annotationVertex of embededed NE + AnnotationGraphOutEdgeIt it1, it1_end; + boost::tie(it1, it1_end) = boost::out_edges(va1, annotationData->getGraph()); + for (; it1 != it1_end; it1++) + { + if ( annotationData->intAnnotation((*it1), Common::Misc::utf8stdstring2limastring("holds"))==1) + { + AnnotationGraphVertex va2; + va2 = target(*it1, annotationData->getGraph()); + LDEBUG << "SpecificEntitiesXmlLogger::outputEntity: embeded agv = " << va2; + // récupérer le noeud du graphe linguistique + LinguisticGraphVertex v2 = annotationData->intAnnotation(va2, Common::Misc::utf8stdstring2limastring(m_graph)); + LDEBUG << "SpecificEntitiesXmlLogger::outputEntity: vertex in " << m_graph << " is " << v2; + // récupérer l'annotation SpecifiEntity + if (annotationData->hasAnnotation(va2, Common::Misc::utf8stdstring2limastring("SpecificEntity"))) + { + SpecificEntityAnnotation* annot2 = + annotationData->annotation(va2, Common::Misc::utf8stdstring2limastring("SpecificEntity")).pointerValue(); + LDEBUG << "SpecificEntitiesXmlLogger::outputEntity: annot2 = " << annot2; + outputEntity(annotationData,out, v2, annot2, tokenMap, offset); + break; + } + } + } } out << "" << ""; diff --git 
a/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesXmlLogger.h b/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesXmlLogger.h index 6a9e25dbc..a91accf65 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesXmlLogger.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SpecificEntities/SpecificEntitiesXmlLogger.h @@ -65,7 +65,8 @@ class LIMA_SPECIFICENTITIES_EXPORT SpecificEntitiesXmlLogger : public AbstractTe const SpecificEntityAnnotation* getSpecificEntityAnnotation(LinguisticGraphVertex v, const Common::AnnotationGraphs::AnnotationData* annotationData) const; - void outputEntity(std::ostream& out, + void outputEntity(Common::AnnotationGraphs::AnnotationData* annotationData, + std::ostream& out, LinguisticGraphVertex v, const SpecificEntityAnnotation* annot, const VertexTokenPropertyMap& tokenMap, diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/ChainsDisambiguator.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/ChainsDisambiguator.cpp index 63d4fc55f..0176e6b3a 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/ChainsDisambiguator.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/ChainsDisambiguator.cpp @@ -180,7 +180,7 @@ void ChainsDisambiguator::computePaths() const std::set< ChainIdStruct >& nextVertexChains = chainsMap[nextVertex]; LinguisticCode nextMicroCateg(0); const MorphoSyntacticData* nextData = dataMap[nextVertex]; - if (nextData == 0) + if (nextData == 0 || nextData->empty()) { SADLOGINIT; LWARN << "vertex " << nextVertex << " has no data"; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/DotDependencyGraphWriter.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/DotDependencyGraphWriter.cpp index 
e62545eae..509091067 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/DotDependencyGraphWriter.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/DotDependencyGraphWriter.cpp @@ -350,7 +350,7 @@ void DotDependencyGraphWriter::write_graphviz( } os << v << " -> " << next << " "; - LDEBUG << "PosTaggingDepGraphEdgeWriter for "< " << next; + LTRACE << "PosTaggingDepGraphEdgeWriter for "< " << next; PosTaggingDepGraphEdgeWriter(&lposgraph,m_language,depGraph,syntacticData)(os,*outItr); } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/HomoSyntagmaticConstraints.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/HomoSyntagmaticConstraints.cpp index d3659a79a..401495fae 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/HomoSyntagmaticConstraints.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/HomoSyntagmaticConstraints.cpp @@ -51,6 +51,7 @@ #include #include #include +#include //using namespace boost; using namespace Lima::Common::MediaticData; @@ -97,6 +98,9 @@ CreateRelationReverseWithRelatedFactory(CreateRelationReverseWithRelatedId); Automaton::ConstraintFunctionFactory CopyRelationsOutOfToFactory(CopyRelationsOutOfToId); +Automaton::ConstraintFunctionFactory +CopyIncomingRelationsToFactory(CopyIncomingRelationsToId); + Automaton::ConstraintFunctionFactory CreateCompoundTenseFactory(CreateCompoundTenseId); @@ -160,13 +164,10 @@ bool SecondUngovernedBy::operator()( const LinguisticGraphVertex& v2, AnalysisContent& analysis ) const { -/* - Critical Function : comment logging messages -*/ -// SAPLOGINIT; -// LDEBUG << "testing SecondUngovernedBy for " -// << v1 << " and " << v2 -// << " with relation: " << m_relation; +#ifdef DEBUG_LP + SAPLOGINIT; + LDEBUG << "testing SecondUngovernedBy for " << v1 << " and " << v2 << " with relation: " << m_relation; +#endif 
const SyntacticData* syntacticData=static_cast(analysis.getData("SyntacticData")); @@ -219,12 +220,10 @@ bool GovernorOf::operator()(const AnalysisGraph& graph, const LinguisticGraphVertex& v1, AnalysisContent& analysis) const { -/* - Critical function : comment logging messages -*/ -// SAPLOGINIT; -// LDEBUG << "testing GovernorOf for " << v1 -// << " with relation : " << m_relation; +#ifdef DEBUG_LP + SAPLOGINIT; + LDEBUG << "testing GovernorOf for " << v1 << " with relation : " << m_relation; +#endif const SyntacticData* syntacticData=static_cast(analysis.getData("SyntacticData")); @@ -262,12 +261,10 @@ bool GovernedBy::operator()(const AnalysisGraph& graph, const LinguisticGraphVertex& v1, AnalysisContent& analysis) const { -/* - Critical function : comment logging message -*/ -// SAPLOGINIT; -// LDEBUG << "testing GovernedBy for " << v1 -// << " with relation: " << m_relation; +#ifdef DEBUG_LP + SAPLOGINIT; + LDEBUG << "testing GovernedBy for " << v1 << " with relation: " << m_relation; +#endif const SyntacticData* syntacticData=static_cast(analysis.getData("SyntacticData")); if (v1 == graph.firstVertex() || v1 == graph.lastVertex() ) { @@ -306,12 +303,10 @@ bool SameNominalChain::operator()(const AnalysisGraph& graph, const LinguisticGraphVertex& v2, AnalysisContent& /*ac*/) const { -/* - Critical function : comment logging message -*/ -// SAPLOGINIT; -// LDEBUG << "testing SameNominalChain for " << v1 << " and " << v2 -// ; +#ifdef DEBUG_LP + SAPLOGINIT; + LDEBUG << "testing SameNominalChain for " << v1 << " and " << v2; +#endif CVertexChainIdPropertyMap map = get(vertex_chain_id, *(graph.getGraph())); VertexChainIdProp::const_iterator it1 = map[v1].begin(); @@ -352,13 +347,10 @@ bool SameVerbalChain::operator()(const AnalysisGraph& graph, const LinguisticGraphVertex& v2, AnalysisContent& /*ac*/) const { -/* - Critical function : comment logging message -*/ - // return graph.SameVerbalChain(v1, v2, false); - -// SAPLOGINIT; -// LDEBUG << "testing 
SameVerbalChain for " << v1 << " and " << v2; +#ifdef DEBUG_LP + SAPLOGINIT; + LDEBUG << "testing SameVerbalChain for " << v1 << " and " << v2; +#endif CVertexChainIdPropertyMap map = get(vertex_chain_id, *(graph.getGraph())); VertexChainIdProp::const_iterator it1 = map[v1].begin(); VertexChainIdProp::const_iterator it1_end = map[v1].end(); @@ -402,12 +394,11 @@ bool CreateRelationBetween::operator()(const AnalysisGraph&, const LinguisticGraphVertex& v2, AnalysisContent& analysis ) const { -/* - Critical function : comment logging message -*/ -// SAPLOGINIT; -// LDEBUG << "testing CreateRelationBetween for " << v1 << " and " -// << v2 << " with relation: " << static_cast(Common::MediaticData::MediaticData::single().mediaData(m_language_id)).getSyntacticRelationName(m_relation); +#ifdef DEBUG_LP + SAPLOGINIT; + LDEBUG << "testing CreateRelationBetween for " << v1 << " and " + << v2 << " with relation: " << static_cast(Common::MediaticData::MediaticData::single().mediaData(m_language_id)).getSyntacticRelationName(m_relation); +#endif SyntacticData* syntacticData=static_cast(analysis.getData("SyntacticData")); bool res = syntacticData->relation(v1, v2, m_relation); // LDEBUG << "CreateRelationBetween: " << (res?"yes":"no"); @@ -464,13 +455,14 @@ bool RemoveOutRelationFrom::operator()(const AnalysisGraph& graph, CopyRelationsOutOfTo::CopyRelationsOutOfTo(MediaId language, const LimaString& complement): - ConstraintWithRelationComplement(language,complement) + Automaton::ConstraintFunction(language,complement), + m_relations(complement.split(",")) { /* Critical function : comment logging message */ -// SAPLOGINIT; -// LDEBUG << "CopyRelationsOutOfTo::CopyRelationsOutOfTo" << language << complement << m_relation; + SAPLOGINIT; + LDEBUG << "CopyRelationsOutOfTo::CopyRelationsOutOfTo" << language << complement << m_relations; } bool CopyRelationsOutOfTo::operator()(const AnalysisGraph& graph, @@ -481,8 +473,8 @@ bool CopyRelationsOutOfTo::operator()(const AnalysisGraph& 
graph, /* Critical function : comment logging message */ -// SAPLOGINIT; -// LDEBUG << "CopyRelationsOutOfTo" << v1 << v2; + SAPLOGINIT; + LDEBUG << "CopyRelationsOutOfTo" << v1 << v2; SyntacticData* syntacticData=static_cast(analysis.getData("SyntacticData")); if ( v1 == graph.firstVertex() || v1 == graph.lastVertex() || v2 == graph.firstVertex() || v2 == graph.lastVertex() ) @@ -494,19 +486,80 @@ bool CopyRelationsOutOfTo::operator()(const AnalysisGraph& graph, DependencyGraphVertex dv1 = syntacticData-> depVertexForTokenVertex(v1); DependencyGraphOutEdgeIt it, it_end; boost::tie(it, it_end) = out_edges(dv1, *(syntacticData-> dependencyGraph())); - bool res = true; + bool res = false; for (; it != it_end; it++) { - LinguisticGraphVertex target = syntacticData->tokenVertexForDepVertex(boost::target(*it,*(syntacticData-> dependencyGraph()))); + QString relation = QString::fromUtf8(static_cast(Common::MediaticData::MediaticData::single().mediaData(m_language_id)).getSyntacticRelationName(map[*it]).c_str()); + LDEBUG << "CopyRelationsOutOfTo" << relation << m_relations; + if (m_relations.contains(relation)) + { + LDEBUG << "CopyRelationsOutOfTo copying" << relation; + LinguisticGraphVertex target = syntacticData->tokenVertexForDepVertex(boost::target(*it,*(syntacticData-> dependencyGraph()))); - res = syntacticData->relation(v2, target, map[*it]); - if (!res) break; + if (syntacticData->relation(v2, target, map[*it])) + res = true; + } } -// LDEBUG << "CopyRelationsOutOfTo:" << res; - return res; + LDEBUG << "CopyRelationsOutOfTo:" << res; + return true; } +//********************************************************************** + +CopyIncomingRelationsTo::CopyIncomingRelationsTo(MediaId language, + const LimaString& complement): + Automaton::ConstraintFunction(language,complement), + m_relations(complement.split(",")) +{ +/* + Critical function : comment logging message +*/ + SAPLOGINIT; + LDEBUG << "CopyIncomingRelationsTo::CopyIncomingRelationsTo" << language << 
complement << m_relations; +} + +bool CopyIncomingRelationsTo::operator()(const AnalysisGraph& graph, + const LinguisticGraphVertex& v1, + const LinguisticGraphVertex& v2, + AnalysisContent& analysis) const +{ +/* + Critical function : comment logging message +*/ + SAPLOGINIT; + LDEBUG << "CopyIncomingRelationsTo" << v1 << v2; + SyntacticData* syntacticData=static_cast(analysis.getData("SyntacticData")); + if ( v1 == graph.firstVertex() || v1 == graph.lastVertex() + || v2 == graph.firstVertex() || v2 == graph.lastVertex() ) + { + LDEBUG << "CopyIncomingRelationsTo: false"; + return false; + } + EdgeDepRelTypePropertyMap map = get(edge_deprel_type, *(syntacticData-> dependencyGraph())); + + DependencyGraphVertex dv1 = syntacticData-> depVertexForTokenVertex(v1); + DependencyGraphInEdgeIt it, it_end; + boost::tie(it, it_end) = in_edges(dv1, *(syntacticData-> dependencyGraph())); + bool res = false; + for (; it != it_end; it++) + { + LinguisticGraphVertex source = syntacticData->tokenVertexForDepVertex(boost::source(*it,*(syntacticData-> dependencyGraph()))); + QString relation = QString::fromUtf8(static_cast(Common::MediaticData::MediaticData::single().mediaData(m_language_id)).getSyntacticRelationName(map[*it]).c_str()); + LDEBUG << "CopyIncomingRelationsTo" << relation << m_relations; + if (m_relations.contains(relation)) + { + LDEBUG << "CopyIncomingRelationsTo copying" << relation; + if (syntacticData->relation(source, v2, map[*it])) + res = true; + } + } + + LDEBUG << "CopyIncomingRelationsTo:" << res; + return true; +} + + //********************************************************************** @@ -711,7 +764,7 @@ bool CreateRelationReverseWithRelated::operator()( //********************************************************************** // complement contains symbols for category and microcategory -// (e.g.: L_NC;L_NC_GEN;) +// (e.g.: NC;NC_GEN;) CreateCompoundTense::CreateCompoundTense(MediaId language, const LimaString& complement): 
ConstraintFunction(language,complement), @@ -719,6 +772,10 @@ CreateCompoundTense::CreateCompoundTense(MediaId language, m_micro(0), m_tempCompType(0) { +#ifdef DEBUG_LP + SAPLOGINIT; + LDEBUG << "CreateCompoundTense::CreateCompoundTense()" << language << complement; +#endif const std::string str= Common::Misc::limastring2utf8stdstring(complement); @@ -728,7 +785,16 @@ CreateCompoundTense::CreateCompoundTense(MediaId language, size_t secondSepPos = str.find_first_of(';', firstSepPos+1); m_micro=static_cast(Common::MediaticData::MediaticData::single().mediaData(language)).getPropertyCodeManager().getPropertyManager("MICRO").getPropertyValue(str.substr(firstSepPos + 1, secondSepPos - firstSepPos - 1)); - m_tempCompType=static_cast(Common::MediaticData::MediaticData::single().mediaData(language)).getSyntacticRelationId("TEMPCOMP"); +#ifdef ANTINNO_SPECIFIC + // Attention, si on passe aux, il faut modifier mm common de la langue en conséquence + m_tempCompType=static_cast(Common::MediaticData::MediaticData::single().mediaData(language)).getSyntacticRelationId("TEMPCOMP"); +#else + m_tempCompType=static_cast(Common::MediaticData::MediaticData::single().mediaData(language)).getSyntacticRelationId("aux"); +#endif + +#ifdef DEBUG_LP + LDEBUG << "CreateCompoundTense::CreateCompoundTense() m_tempCompType" << m_tempCompType; +#endif m_macroAccessor=&(static_cast(Common::MediaticData::MediaticData::single().mediaData(language)).getPropertyCodeManager().getPropertyAccessor("MACRO")); m_microAccessor=&(static_cast(Common::MediaticData::MediaticData::single().mediaData(language)).getPropertyCodeManager().getPropertyAccessor("MICRO")); @@ -744,13 +810,9 @@ bool CreateCompoundTense::operator()(const AnalysisGraph& anagraph, const LinguisticGraphVertex& auxVertex, AnalysisContent& analysis ) const { -/* - Critical function : comment logging message -*/ #ifdef DEBUG_LP SAPLOGINIT; - LDEBUG << "creating compound tense for " << auxVertex << " and " - << pastPartVertex; + LDEBUG << 
"creating compound tense for " << auxVertex << " and " << pastPartVertex; #endif SyntacticData* syntacticData=static_cast(analysis.getData("SyntacticData")); @@ -773,7 +835,19 @@ bool CreateCompoundTense::operator()(const AnalysisGraph& anagraph, Token* tokenAux = tokenMap[auxVertex]; Token* tokenPastPart = tokenMap[pastPartVertex]; const MorphoSyntacticData* dataAux = dataMap[auxVertex]; + if (dataAux->empty()) + { + SAPLOGINIT; + LERROR << "CreateCompoundTense::operator() morphosyntactic data is empty for aux. Abort."; + return false; + } const MorphoSyntacticData* dataPastPart = dataMap[pastPartVertex]; + if (dataPastPart->empty()) + { + SAPLOGINIT; + LERROR << "CreateCompoundTense::operator() morphosyntactic data is empty for past participle. Abort."; + return false; + } LinguisticCode dataAuxMicro = dataAux->firstValue(*m_microAccessor); LinguisticCode tense = static_cast(Common::MediaticData::MediaticData::single().mediaData(language)).compoundTense(dataAuxMicro, dataAux->firstValue(*m_timeAccessor)); @@ -814,9 +888,7 @@ bool CreateCompoundTense::operator()(const AnalysisGraph& anagraph, // creer un MorphoSyntacticData #ifdef DEBUG_LP - LDEBUG << "Creating a DicoWord: " - << int(m_macro) << " / " << Common::Misc::limastring2utf8stdstring(verbFlex) << " / " - << int(m_micro) << " / " << verbLemma; + LDEBUG << "Creating a DicoWord: " << m_macro << " / " << verbFlex << " / " << m_micro << " / " << verbLemma; #endif MorphoSyntacticData* dataNewVerb = new MorphoSyntacticData(); /// if the anagraph is not set to delete the morphosyntactic data, we have to do it @@ -1025,7 +1097,7 @@ bool CreateCompoundTense::operator()(const AnalysisGraph& anagraph, } // copier vers le noeud du nouveau verbe toutes les relations de - // dépendance (sauf TEMPCOMP) qui avaient pour source ou destination + // dépendance (sauf aux) qui avaient pour source ou destination // l'auxiliaire ou le participe passé EdgeDepRelTypePropertyMap edgeTypeMap = get( edge_deprel_type, depGraph); @@ 
-1125,7 +1197,7 @@ bool CreateCompoundTense::operator()(const AnalysisGraph& anagraph, m_tempCompType); #ifdef DEBUG_LP - LDEBUG << "CreateCompoundTense: " << res; + LDEBUG << "CreateCompoundTense: " << m_tempCompType << res; #endif RecognizerData* recoData=static_cast(analysis.getData("RecognizerData")); if (recoData == 0) @@ -1133,6 +1205,9 @@ bool CreateCompoundTense::operator()(const AnalysisGraph& anagraph, recoData = new RecognizerData(); analysis.setData("RecognizerData", recoData); } +#ifdef DEBUG_LP + LDEBUG << "CreateCompoundTense setNextVertex:" << newVertex; +#endif recoData->setNextVertex(newVertex); #ifdef DEBUG_LP @@ -1160,7 +1235,7 @@ CreateEasyCompoundTense::CreateEasyCompoundTense(MediaId language, size_t secondSepPos = str.find_first_of(';', firstSepPos+1); m_micro=static_cast(Common::MediaticData::MediaticData::single().mediaData(language)).getPropertyCodeManager().getPropertyManager("MICRO").getPropertyValue(str.substr(firstSepPos + 1, secondSepPos - firstSepPos - 1)); - m_tempCompType=static_cast(Common::MediaticData::MediaticData::single().mediaData(language)).getSyntacticRelationId("TEMPCOMP"); + m_tempCompType=static_cast(Common::MediaticData::MediaticData::single().mediaData(language)).getSyntacticRelationId("aux"); m_macroAccessor=&(static_cast(Common::MediaticData::MediaticData::single().mediaData(language)).getPropertyCodeManager().getPropertyAccessor("MACRO")); m_microAccessor=&(static_cast(Common::MediaticData::MediaticData::single().mediaData(language)).getPropertyCodeManager().getPropertyAccessor("MICRO")); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/HomoSyntagmaticConstraints.h b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/HomoSyntagmaticConstraints.h index e61c6129c..9bc25fba8 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/HomoSyntagmaticConstraints.h +++ 
b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/HomoSyntagmaticConstraints.h @@ -57,6 +57,7 @@ namespace SyntacticAnalysis { #define CreateRelationWithRelatedId "CreateRelationWithRelated" #define CreateRelationReverseWithRelatedId "CreateRelationReverseWithRelated" #define CopyRelationsOutOfToId "CopyRelationsOutOfTo" +#define CopyIncomingRelationsToId "CopyIncomingRelationsTo" #define CreateCompoundTenseId "CreateCompoundTense" #define CreateEasyCompoundTenseId "CreateEasyCompoundTense" #define FindRelationFromId "FindRelationFrom" @@ -243,6 +244,10 @@ class LIMA_SYNTACTICANALYSIS_EXPORT CreateRelationWithRelated : public Automaton * @brief This constraint add in the relations buffer the relations of the given * type from the targets of relations out of v2 of the given types to v1. * + * It allows to draw a relation (of type the last element in the complement + * list) from the target (v1) of the given relations (all except the last in the + * complement list) to the trigger. + * * The complement must be of the form: * "rel2|…|reln,rel1" * with rel1 the type of the relation to create and rel2, …, reln the types of @@ -286,9 +291,9 @@ class LIMA_SYNTACTICANALYSIS_EXPORT RemoveOutRelationFrom : public ConstraintWit }; /** - *@brief Copy all relations out of v1 t relations out of v2. Targets and types are kept. + *@brief Copy all relations out of v1 to relations out of v2. Targets and types are kept. 
*/ -class LIMA_SYNTACTICANALYSIS_EXPORT CopyRelationsOutOfTo : public ConstraintWithRelationComplement +class LIMA_SYNTACTICANALYSIS_EXPORT CopyRelationsOutOfTo : public Automaton::ConstraintFunction { public: explicit CopyRelationsOutOfTo(MediaId language, @@ -300,9 +305,34 @@ class LIMA_SYNTACTICANALYSIS_EXPORT CopyRelationsOutOfTo : public ConstraintWith AnalysisContent& analysis) const; private: + QStringList m_relations; }; -/** @brief This constraint creates a TEMPCOMP relation between its two +/** + *@brief Copy all relations incoming to v1 to relations incoming to of v2. Targets and types are kept. + * + * Used to recopy relations + * - pointing to the first member of a coordination (target of COORD1) to the second member + * (target of COORD2) or + * - pointing to the second member of a coordination (target of COORD2) to the first member + * (target of COORD1) + */ +class LIMA_SYNTACTICANALYSIS_EXPORT CopyIncomingRelationsTo : public Automaton::ConstraintFunction +{ +public: + explicit CopyIncomingRelationsTo(MediaId language, + const LimaString& complement=LimaString()); + ~CopyIncomingRelationsTo() {} + bool operator()(const LinguisticAnalysisStructure::AnalysisGraph& graph, + const LinguisticGraphVertex& v1, + const LinguisticGraphVertex& v2, + AnalysisContent& analysis) const; + +private: + QStringList m_relations; +}; + +/** @brief This constraint creates a aux relation between its two * parameters * * @todo It was originaly supposed to replace the two vertices (auxiliary and diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SelectionalPreferences.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SelectionalPreferences.cpp index 4e70d54a8..fbaffe833 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SelectionalPreferences.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SelectionalPreferences.cpp @@ -27,6 +27,7 @@ 
*/ #include "SelectionalPreferences.h" +#include "common/tools/FileUtils.h" #include "common/Data/strwstrtools.h" #include "common/AbstractFactoryPattern/SimpleFactory.h" #include "common/MediaticData/mediaticData.h" @@ -66,7 +67,7 @@ void SelectionalPreferences::init( try { std::string resourcePath=Common::MediaticData::MediaticData::single().getResourcesPath(); - std::string preferencesFileName=resourcePath + "/" + unitConfiguration.getParamsValueAtKey("file"); + std::string preferencesFileName = Common::Misc::findFileInPaths(resourcePath.c_str(), unitConfiguration.getParamsValueAtKey("file").c_str()).toUtf8().constData(); loadFromFile(preferencesFileName); } catch (Common::XMLConfigurationFiles::NoSuchParam& ) @@ -93,8 +94,7 @@ void SelectionalPreferences::loadFromFile(const std::string& fileName) return; } - std::string line; - getline(ifl, line); + std::string line = Lima::Common::Misc::readLine(ifl); Common::Misc::chomp(line); linesCounter++; while (ifl.good() && !ifl.eof()) @@ -158,7 +158,7 @@ void SelectionalPreferences::loadFromFile(const std::string& fileName) boost::tuple< std::string, LinguisticCode, std::string, std::string, LinguisticCode > tuple(target,targetMacro,dependency,source,soureceMacro); m_preferences.insert(std::make_pair(tuple, probability)); } - getline(ifl, line); + line = Lima::Common::Misc::readLine(ifl); Common::Misc::chomp(line); linesCounter++; } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SelectionalRestrictionsConstraints.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SelectionalRestrictionsConstraints.cpp index 89781b47a..e1d6a6515 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SelectionalRestrictionsConstraints.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SelectionalRestrictionsConstraints.cpp @@ -169,7 +169,12 @@ bool DisambiguateWith::operator()(const AnalysisGraph& graph, { 
SAPLOGINIT; LERROR << "no graph 'PosGraph' available !"; +#ifdef ANTINNO_SPECIFIC + // FWI 26/06/2016 doit retourner un booléen + return false; + #else return MISSING_DATA; +#endif } LinguisticGraph* lingGraph = const_cast(posgraph->getGraph()); // LDEBUG << "There is " << out_degree(v2, *lingGraph) << " edges out of " << v2; diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntacticAnalysisTools.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntacticAnalysisTools.cpp index e11c90c24..246daed2c 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntacticAnalysisTools.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntacticAnalysisTools.cpp @@ -394,7 +394,7 @@ displayRelationsDistanceOfArguments(const SyntacticData& data, } else { // can be equal (l', n'...) or - // negative (TEMPCOMP -> length of auxiliary has changed) + // negative (aux -> length of auxiliary has changed) out << 0; } out << endl; @@ -467,7 +467,7 @@ void SyntacticAnalysisTools::displayRelationsXMLFormat(const SyntacticData& data } else { // can be equal (l', n'...) or - // negative (TEMPCOMP -> length of auxiliary has changed) + // negative (aux -> length of auxiliary has changed) pathDistance = 0; } xmlStream << "" diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntacticAnalyzer-chains.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntacticAnalyzer-chains.cpp index c93efcf9d..e3efd6b05 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntacticAnalyzer-chains.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntacticAnalyzer-chains.cpp @@ -1,771 +1,863 @@ -/* - Copyright 2002-2013 CEA LIST - - This file is part of LIMA. 
- - LIMA is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - LIMA is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with LIMA. If not, see -*/ -/** - * - * @file SyntacticAnalyzer-chains.cpp - * @author Gael de Chalendar (Gael.de-Chalendar@cea.fr) - - * Copyright (c) 2003 by CEA - * @date Created on Aug, 31 2004 - * @version $Id$ - * - */ - -#include "SyntacticAnalyzer-chains.h" -#include "SyntagmaticMatrix.h" - -#include "common/AbstractFactoryPattern/SimpleFactory.h" -#include "linguisticProcessing/core/LinguisticResources/LinguisticResources.h" -#include "linguisticProcessing/core/LinguisticAnalysisStructure/AnalysisGraph.h" -#include "linguisticProcessing/core/TextSegmentation/SegmentationData.h" -#include "common/time/timeUtilsController.h" -#include "common/LimaCommon.h" - -#undef min -#undef max - -using namespace std; -//using namespace boost; -using namespace Lima::Common::MediaticData; -using namespace Lima::LinguisticProcessing::LinguisticAnalysisStructure; - -namespace Lima -{ -namespace LinguisticProcessing -{ -namespace SyntacticAnalysis -{ - -static const uint64_t DEFAULT_MAXCHAINSNBBYVERTEX = 30; -static const uint64_t DEFAULT_MAXCHAINLENGTH = 200; - -SimpleFactory syntacticAnalyzerChainsFactory(SYNTACTICANALYZERCHAINS_CLASSID); - -SyntacticAnalyzerChains::SyntacticAnalyzerChains() : - m_language(), - m_chainMatrix(0), - m_maxChainsNbByVertex(std::numeric_limits::max()) -{} - -void SyntacticAnalyzerChains::init( - Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, - 
Manager* manager) - -{ - SACLOGINIT; - m_language=manager->getInitializationParameters().media; - m_macroAccessor=&(static_cast(Common::MediaticData::MediaticData::single().mediaData(m_language)).getPropertyCodeManager().getPropertyAccessor("MACRO")); - try - { - std::string chainMatrixId=unitConfiguration.getParamsValueAtKey("chainMatrix"); - m_chainMatrix=static_cast(LinguisticResources::single().getResource(m_language,chainMatrixId)); - } - catch (Common::XMLConfigurationFiles::NoSuchParam& ) - { - LERROR << "no parameter 'chainMatrix' in SyntacticAnalyzerChains group for language " << (int) m_language << " !"; - throw InvalidConfiguration(); - } - try - { - std::string maxChainsNbByVertexS=unitConfiguration.getParamsValueAtKey("maxChainsNbByVertex"); - std::istringstream iss(maxChainsNbByVertexS); - iss >> m_maxChainsNbByVertex; - } - catch (Common::XMLConfigurationFiles::NoSuchParam& ) - { - LWARN << "no parameter 'maxChainsNbByVertex' in SyntacticAnalyzerChains group for language " << (int) m_language << " ! Using default: "<> m_maxChainLength; - } - catch (Common::XMLConfigurationFiles::NoSuchParam& ) - { - LWARN << "no parameter 'maxChainLength' in SyntacticAnalyzerChains group for language " << (int) m_language << " ! Using default: "<(Common::MediaticData::MediaticData::single().mediaData(m_language)).getPropertyCodeManager().getPropertyManager("MACRO").getPropertyValue(id); - } - catch (Common::XMLConfigurationFiles::NoSuchParam& ) - { - LWARN << "No ponctu macrocategory defined ! 
use category PONCTU"; - m_ponctuCategory=static_cast(Common::MediaticData::MediaticData::single().mediaData(m_language)).getPropertyCodeManager().getPropertyManager("MACRO").getPropertyValue("PONCTU"); - } - -} - -LimaStatusCode SyntacticAnalyzerChains::process( - AnalysisContent& analysis) const -{ - Lima::TimeUtilsController timer("SyntacticAnalysis"); - SACLOGINIT; - LINFO << "start syntactic analysis - chains"; - // create syntacticData - AnalysisGraph* anagraph=static_cast(analysis.getData("PosGraph")); - if (anagraph==0) - { - LERROR << "no PosGraph ! abort"; - return MISSING_DATA; - } - SegmentationData* sb=static_cast(analysis.getData("SentenceBoundaries")); - if (sb==0) - { - LERROR << "no sentence bounds ! abort"; - return MISSING_DATA; - } - if (sb->getGraphId() != "PosGraph") { - LERROR << "SentenceBounds have been computed on " << sb->getGraphId() << " !"; - LERROR << "SyntacticAnalyzer-deps needs SentenceBounds on PosGraph"; - return INVALID_CONFIGURATION; - } - - SyntacticData* syntacticData=dynamic_cast(analysis.getData("SyntacticData")); - if (syntacticData==0) - { - syntacticData=new SyntacticData(anagraph,m_chainMatrix); - analysis.setData("SyntacticData",syntacticData); - } - else if (syntacticData->matrices() == 0) - { - syntacticData->matrices(m_chainMatrix); - } - syntacticData->setupDependencyGraph(); - - uint64_t chainId = m_firstChainId; - std::list ponctuMacroFilter; - ponctuMacroFilter.push_back(m_ponctuCategory); - -// bool l2r = true; - // ??OME2 for (SegmentationData::const_iterator boundItr=sb->begin(); - // boundItr!=sb->end(); - for (std::vector::const_iterator boundItr=(sb->getSegments()).begin(); - boundItr!=(sb->getSegments()).end(); - boundItr++) - { - LinguisticGraphVertex beginSentence=boundItr->getFirstVertex(); - LinguisticGraphVertex endSentence=boundItr->getLastVertex(); -// LDEBUG << "analyze sentence from vertex " << beginSentence << " to vertex " << endSentence; - LinguisticGraphVertex current, next; - current = 
beginSentence; next = current; - while (next != endSentence) - { -// LDEBUG << "nextChainsBreak"; - next = anagraph->nextMainPathVertex(current,*m_macroAccessor,ponctuMacroFilter,endSentence); -// LDEBUG << "analyze chain from " << current << " to " << next; -// LDEBUG << "identify chains"; - identifyChains(syntacticData,current,next,chainId); - current = next; - } - beginSentence=endSentence; - } - - LINFO << "end syntactic analysis - chains"; - return SUCCESS_ID; -} - - -void SyntacticAnalyzerChains::identifyChains(SyntacticData* data, - const LinguisticGraphVertex& start, - const LinguisticGraphVertex& stop, - uint64_t& startChainId) const -{ -// SACLOGINIT; -// LDEBUG << "Searching chains from/to (morph): " << start << "/" << stop; - if (start == stop) - return; - VertexChainIdPropertyMap vertexChainIdMap = get( vertex_chain_id, *(data->graph()) ); - std::set< std::string > alreadyReported; - LinguisticGraphVertex first = data->iterator()-> firstVertex(); - LinguisticGraphVertex last = data->iterator()-> lastVertex(); - VertexDataPropertyMap dataMap = get(vertex_data, (*data->iterator()->getGraph() ) ); -// VertexTokenPropertyMap tokenMap =get(vertex_token, (*data->iterator()->getGraph() ) ); - - std::vector< ChainStackTuple > pile; - // std::stack< LinguisticGraphVertex > pileSons; - Common::MediaticData::ChainsType currentType = Common::MediaticData::NO_CHAIN_TYPE; -// std::stack< std::pair< std::deque< ChainStackTuple >, std::stack< LinguisticGraphVertex > > > tank; - std::vector< std::vector< ChainStackTuple > > tank; - std::set< LinguisticGraphVertex > alreadyFinished; - std::vector nextVxs; -// LDEBUG << "Initializing nextVxs with " << start; - nextVxs.push_back(start); - - - while (! 
( tank.empty() && nextVxs.empty()) ) - { -// LDEBUG << "LOOP"; - if (pile.size() >= m_maxChainLength) - { -#ifdef DEBUG_LP - SACLOGINIT; - LNOTICE << "Chain reached its max size or is too long."; -#endif -// LDEBUG << "Trying to find a chain end in the too long stack"; - LinguisticGraphVertex lastChainVx = unstackUptoChainEnd(data, pile, currentType); - if (lastChainVx != first) { -// LDEBUG << "Chain end is " << lastChainVx << ". Reporting the chain in the graph."; - std::string newChainString = stringChain(data,pile, currentType, alreadyFinished,startChainId,lastChainVx); - alreadyReported.insert(newChainString); - reportChainInGraph(data,pile, currentType, alreadyFinished,startChainId,lastChainVx); - LinguisticGraphOutEdgeIt it, it_end; - boost::tie(it, it_end) = out_edges(lastChainVx, *(data->graph())); -// LDEBUG << "Initializing for the sons of " << lastChainVx; - for (; it != it_end; it++) - { -// LDEBUG << "Looking at an out edge of the chain's last vertex : " << *it; - LinguisticGraphVertex nextVx = target(*it, *(data->graph())); - if (alreadyFinished.find(nextVx) == alreadyFinished.end()) - { -// LDEBUG << "Adding " << nextVx << " to nextVxs"; - nextVxs.push_back(nextVx); - } - } - } - else { -// LDEBUG << "NoChainEndInStack"; - } - if ( ! 
tank.empty() ) - { -// LDEBUG << "Using a new stack after chain too long"; -// boost::tie(pile, pileSons) = tank.back(); - pile = tank.back(); - tank.pop_back(); - } - } - else if (tank.empty()) - { -// LDEBUG << "tank is empty"; - LinguisticGraphVertex nextVx = nextVxs.back(); - nextVxs.pop_back(); - while (alreadyFinished.find(nextVx) != alreadyFinished.end()) - { - if (nextVxs.empty()) - { -// LDEBUG << "Nothing more to work on: returning"; - return; - } -// LDEBUG << "Ignoring next vertex " << nextVx << " because it is already finished."; - nextVx = nextVxs.back(); - nextVxs.pop_back(); - while ((vertexChainIdMap[nextVx].size() >= m_maxChainsNbByVertex) ) - { - SACLOGINIT; - LNOTICE << "Vertex ignored (" << nextVx << ") because there is too much chains on it."; -// LDEBUG << "Ignoring next vertex " << nextVx << " because there is too much chains on it."; - if (nextVxs.empty()) - { -// LDEBUG << "Nothing more to work on: returning"; - return; - } - nextVx = nextVxs.back(); - nextVxs.pop_back(); - } - } -// LDEBUG << "next vertex is " << nextVx; - bool canFinish = false; - pile.clear(); -// pileSons = std::stack< LinguisticGraphVertex >(); - if ( (nextVx != first) && (nextVx != last) && - ( data->matrices()->canNominalChainBeginBy(dataMap[nextVx]) ) ) - { -// LDEBUG << "next vertex is a nominal chain beginning"; - canFinish = (data->matrices()-> canNominalChainEndBy(dataMap[nextVx])); - pile.push_back(boost::make_tuple(nextVx, canFinish, std::vector< LinguisticGraphVertex >())); - currentType = NOMINAL; - } - else if ( (nextVx != first) && (nextVx != last) && - ( data->matrices()-> canVerbalChainBeginBy(dataMap[nextVx]) ) ) - { -// LDEBUG << "next vertex is a verbal chain beginning"; - canFinish = ( data->matrices()-> canVerbalChainEndBy(dataMap[nextVx])); - pile.push_back(boost::make_tuple(nextVx, canFinish, std::vector< LinguisticGraphVertex >())); - currentType = VERBAL; - } - else - { -// LDEBUG << "next vertex " << nextVx << " is not a chain beginning"; - 
currentType = NO_CHAIN_TYPE; -// LDEBUG << "Adding nextVx " << nextVx << " to alreadyFinished"; -// alreadyFinished.insert(nextVx); - } - - if (nextVx != stop) - { - std::vector< LinguisticGraphVertex > sons; - LinguisticGraphOutEdgeIt it, it_end; - boost::tie(it, it_end) = out_edges(nextVx, *(data->graph())); - for (; it != it_end; it++) - { -// LDEBUG << "Looking at the next vertex out edge: " << *it; - LinguisticGraphVertex nextNext = target(*it, *(data->graph())); - if (nextNext != last) - { - if ( ( alreadyFinished.find(nextNext) == alreadyFinished.end()) && (currentType != NO_CHAIN_TYPE) ) - { -// LDEBUG << "Adding " << nextNext << " to sons of " << nextVx; - sons.push_back(nextNext); - } - else - { -// LDEBUG << "Adding " << nextNext << " to nextVxs"; - nextVxs.push_back(nextNext); - // The addition of the line below seems to solve a loop problem - // whithout producing regressions in TVA tests. - alreadyFinished.insert(nextVx); - } - } - } - if (!sons.empty() && !pile.empty()) - { -// LDEBUG << nextVx << " has sons: pushing them to the tank"; -// tank.push_back(std::make_pair(pile, sons)); - pile.back().get<2>() = sons; - tank.push_back(pile); - } - } - } - else - { - LinguisticGraphVertex father = pile.back().get<0>(); - LinguisticGraphVertex currentSon = pile.back().get<2>().back(); -// LDEBUG << "Father and current son are: " << father << " / " << currentSon; - pile.back().get<2>().pop_back(); - if ( (currentType == NO_CHAIN_TYPE) && (pile.empty()) ) - { - if ( data->matrices()->canNominalChainBeginBy(dataMap[currentSon])) - currentType = NOMINAL; - else if ( data->matrices()->canVerbalChainBeginBy(dataMap[currentSon])) - currentType = VERBAL; - } - - if ( currentType != NO_CHAIN_TYPE ) - { -// LDEBUG << "Current type is " << currentType; - // -------------> - // endroit ou mettre le bloc deplace - // <------------- - if ( (currentSon != last) && - ( data->matrices()-> belongsToMatrix( - dataMap[father], - dataMap[currentSon], - currentType ) ) ) - { -// 
LDEBUG << father << " -> " << currentSon << " is in the matrix"; - bool canFinish = ( data->matrices()->canChainEndBy(dataMap[currentSon], currentType)); - // bloc ci-dessous a deplacer plus haut pour explorer - // toutes les chaines. Pb: rend le parcours tres tres lourd. - // -------------> - if (!pile.empty() && !pile.back().get<2>().empty()) - { -// LDEBUG << father << " has remaining sons: pushing them to the tank"; -// tank.push_back(std::make_pair(pile, pileSons)); - tank.push_back(pile); - } - // <------------- -// LDEBUG << "Pushing " << currentSon << "(" << canFinish << ")"; - pile.push_back(boost::make_tuple(currentSon, canFinish, std::vector< LinguisticGraphVertex >())); - if (currentSon != stop) - { - std::vector< LinguisticGraphVertex >& sons = pile.back().get<2>(); - LinguisticGraphOutEdgeIt it, it_end; - boost::tie(it, it_end) = out_edges(currentSon, *(data->graph())); - for (; it != it_end; it++) - { -// LDEBUG << "Edge is " << *it; -// LDEBUG << "Adding " << target(*it, *(data->graph())) << " to sons of " << currentSon; - sons.push_back(target(*it, *(data->graph()))); - } - } - else - { -// LDEBUG << "Stop reached"; - if (canFinish) - { -// LDEBUG << "currentSon " << currentSon << " is a possible end. Reporting the chain in the graph."; - std::string newChainString = stringChain(data, pile, currentType, alreadyFinished,startChainId,currentSon); - alreadyReported.insert(newChainString); - reportChainInGraph(data, pile, currentType, alreadyFinished,startChainId, currentSon); - } - else - { -// LDEBUG << "currentSon " << currentSon << " is not a possible end."; -// LDEBUG << "Trying to find a chain end in the stack"; - LinguisticGraphVertex lastChainVx = unstackUptoChainEnd(data, pile, currentType); - if (lastChainVx!=first) { -// LDEBUG << "Chain end is " << lastChainVx << ". 
Reporting the chain in the graph."; - std::string newChainString = stringChain(data, pile, currentType, alreadyFinished,startChainId,lastChainVx); - alreadyReported.insert(newChainString); - reportChainInGraph(data, pile, currentType, alreadyFinished,startChainId,lastChainVx); - LinguisticGraphOutEdgeIt it, it_end; - boost::tie(it, it_end) = out_edges(lastChainVx, *(data->graph())); -// LDEBUG << "Initializing for the sons of " << lastChainVx; - for (; it != it_end; it++) - { -// LDEBUG << "Looking at an out edge of the chain's last vertex : " << *it; - LinguisticGraphVertex nextVx = target(*it, *(data->graph())); - if (alreadyFinished.find(nextVx) == alreadyFinished.end()) - { -// LDEBUG << "Adding " << nextVx << " to nextVxs"; - nextVxs.push_back(nextVx); - } - } - } -// else -// { -// LDEBUG << "NoChainEndInStackException catched"; -// } - } - } - } - else - { -// LDEBUG << father << " -> " << currentSon << " NOT in the matrix"; - LinguisticGraphVertex lastChainVx = unstackUptoChainEnd(data, pile, currentType); - if (lastChainVx!=first) - { - std::string newChainString = stringChain(data, pile, currentType, alreadyFinished,startChainId,lastChainVx); - if (alreadyReported.find(newChainString) == alreadyReported.end()) - { -// LDEBUG << "Reporting chain: " << newChainString; - alreadyReported.insert(newChainString); - reportChainInGraph(data, pile, currentType, alreadyFinished,startChainId,lastChainVx); - LinguisticGraphOutEdgeIt it, it_end; - boost::tie(it, it_end) = out_edges(lastChainVx, *(data->graph())); -// LDEBUG << "Initializing for the sons of " << lastChainVx << " after unstacking"; - for (; it != it_end; it++) - { -// LDEBUG << "Looking at an out edge of the chain's last vertex : " << *it; - LinguisticGraphVertex nextVx = target(*it, *(data->graph())); - if (alreadyFinished.find(nextVx) == alreadyFinished.end()) - { -// LDEBUG << "Adding " << nextVx << " to nextVxs"; - nextVxs.push_back(nextVx); - } - } - } -// else -// { -// LDEBUG << "This chain (" << 
newChainString << ") has already been found. Nothing to do."; -// } - } - else - { -// LDEBUG << "No end of chain found in pile"; - if (alreadyFinished.find(currentSon) == alreadyFinished.end()) - { - if ( parentsFinished(data, father, alreadyFinished ) ) - { -// LDEBUG << "Adding father " << father << " to alreadyFinished"; - alreadyFinished.insert(father); - } - if (currentSon != last) - { -// LDEBUG << "Adding " << currentSon << " to nextVxs"; - nextVxs.push_back(currentSon); - } - else - { -// LDEBUG << "Adding current son " << currentSon << " to alreadyFinished"; - alreadyFinished.insert(currentSon); - } - } - } - } - } - - if ( (pile.empty() || pile.back().get<2>().empty()) && (! tank.empty()) ) - { -// LDEBUG << "Using a new stack"; -// boost::tie(pile, pileSons) = tank.back(); - pile = tank.back(); - tank.pop_back(); - } - } - } -// LDEBUG << "<========= chains search finished"; -} - -void SyntacticAnalyzerChains::reportChainInGraph( - SyntacticData* data, - const std::vector< ChainStackTuple >& pile, - Common::MediaticData::ChainsType type, - std::set< LinguisticGraphVertex >& alreadyFinished, - uint64_t& chainId, - const LinguisticGraphVertex& stop) const -{ -// SACLOGINIT; -// LDEBUG << "SyntacticAnalyzerChains::reportChainInGraph"; - - ChainIdStruct property = ChainIdStruct(type, chainId); - - VertexChainIdPropertyMap vertexChainIdMap = get( vertex_chain_id, *(data->graph()) ); - - std::vector< ChainStackTuple >::const_iterator it, it_end; - it = pile.begin(); it_end = pile.end(); - for (; it != it_end; it++) - { - LinguisticGraphVertex current = (*it).get<0>(); - if ((vertexChainIdMap[current].size() >= m_maxChainsNbByVertex) ) - { - SACLOGINIT; - LNOTICE << "Too much chains on " << current << " ; cannot add a new one."; - return; - } - } - - - std::vector< ChainStackTuple >::const_iterator it_beg, it_last; - it = pile.begin(); it_beg = pile.begin(); - it_end = pile.end(); it_last = --(pile.end()); - std::ostringstream oss; - for (; it != it_end; it++) 
- { - LinguisticGraphVertex current = (*it).get<0>(); - if (it == it_beg) - { - if (it_beg == it_last) - property = ChainIdStruct(type, chainId, LinguisticAnalysisStructure::UNIGRAM); - else - property = ChainIdStruct(type, chainId, LinguisticAnalysisStructure::BEGIN); - } - else if (it == it_last) - { - property = ChainIdStruct(type, chainId, LinguisticAnalysisStructure::END); - } - else - { - property = ChainIdStruct(type, chainId, LinguisticAnalysisStructure::PART); - } - oss << current; - if (current != data->iterator()->firstVertex() && current != data->iterator()->lastVertex() - && (vertexChainIdMap[current].size() < m_maxChainsNbByVertex) ) - { -// LDEBUG << "executing: vertexChainIdMap[" << current << "].insert(" << property << ")"; - vertexChainIdMap[current].insert(property); - - if (pile.size() > 1) - { - std::vector< ChainStackTuple >::const_iterator it2, it2_end; - it2 = pile.begin(); it2_end = pile.end(); - bool ok = false; - for (; it2 != it2_end; it2++) - { - LinguisticGraphVertex other = (*it2).get<0>(); - if (other != current) - { - LinguisticGraphEdge e; bool found; - boost::tie (e, found) = edge(current, other, *(data->graph())); - if (found) - { - ok = true; - break; - } - else - { - boost::tie(e, found) = edge(other, current, *(data->graph())); - if (found) - { - ok = true; - break; - } - } - } - } - if (!ok) - { - SACLOGINIT; - LWARN << "An edge should exist for " << current << " !"; - } - } - } - else if (vertexChainIdMap[current].size() >= m_maxChainsNbByVertex) - { - SACLOGINIT; - LNOTICE << "Too much chains on " << current << " ; cannot add a new one."; - } - if (current == stop) - break; - else - oss << " "; - if (current != data->iterator()->firstVertex() && current != data->iterator()->lastVertex() - && (vertexChainIdMap[current].size() < m_maxChainsNbByVertex) ) - if (parentsFinished(data, current, alreadyFinished)) - { -/* LDEBUG << "Parents of " << current << " are finished ; so it too."; - alreadyFinished.insert(current);*/ - } - } 
-// LDEBUG << "Chain " << chainId << " is : " << (type==NOMINAL?"nominal":"verbal") << " " << oss.str(); - chainId++; - } - -bool SyntacticAnalyzerChains::parentsFinished( - const SyntacticData* data, - const LinguisticGraphVertex& v, - const std::set< LinguisticGraphVertex >& alreadyFinished) const -{ -/* - Critical function : comment logging messages -*/ -// SACLOGINIT; -// LDEBUG << "SyntacticAnalyzerChains::parentsFinished"; - - LinguisticGraphInEdgeIt it, it_end; - boost::tie(it, it_end) = in_edges(v, *(data->graph())); - for (; it != it_end; it++) - { - if (alreadyFinished.find(source(*it, *(data->graph()))) == alreadyFinished.end()) - return false; - } - return true; -} - -std::string SyntacticAnalyzerChains::stringChain( - const SyntacticData* data, - const std::vector< ChainStackTuple >& pile, - Common::MediaticData::ChainsType type, - std::set< LinguisticGraphVertex >& alreadyFinished, - uint64_t chainId, - const LinguisticGraphVertex& stop) const -{ -/* - Critical Function : comment logging messages -*/ -// SACLOGINIT; - ChainIdStruct property = ChainIdStruct(type, chainId); - - std::vector< ChainStackTuple >::const_iterator it, it_beg, it_end, it_last; - it = pile.begin(); it_beg = pile.begin(); - it_end = pile.end(); it_last = --(pile.end()); - std::ostringstream oss; - for (; it != it_end; it++) - { - if (it == it_beg) - { - if (it_beg == it_last) - property = ChainIdStruct(type, chainId, LinguisticAnalysisStructure::UNIGRAM); - else - property = ChainIdStruct(type, chainId, LinguisticAnalysisStructure::BEGIN); - } - else if (it == it_last) - { - property = ChainIdStruct(type, chainId, LinguisticAnalysisStructure::END); - } - else - { - property = ChainIdStruct(type, chainId, LinguisticAnalysisStructure::PART); - } - oss << (*it).get<0>(); - LinguisticGraphVertex current = (*it).get<0>(); - if (current == stop) - break; - else - oss << " "; - if (current != data->iterator()->firstVertex() && current != data->iterator()->lastVertex()) - { - if 
(pile.size() > 1) - { - std::vector< ChainStackTuple >::const_iterator it2, it2_end; - // @todo replace by lookup only previous and next vertex in pile - it2 = pile.begin(); it2_end = pile.end(); - bool ok = false; - for (; it2 != it2_end; it2++) - { - LinguisticGraphVertex other = (*it2).get<0>(); - if (other != current) - { - LinguisticGraphEdge e; bool found; - boost::tie (e, found) = edge(current, other, *(data->graph())); - if (found) - { - ok = true; - break; - } - else - { - boost::tie (e, found) = edge(other, current, *(data->graph())); - if (found) - { - ok = true; - break; - } - } - } - } - if (!ok) - { - SALOGINIT; - LWARN << "An edge should exist for " << current << " !"; - } - } - } - if ( parentsFinished(data, current, alreadyFinished) ) - { -// LDEBUG << "Adding current " << current << " to alreadyFinished"; - alreadyFinished.insert(current); - } - } -// LDEBUG << "In stringChain, chain " << chainId << " is : " << (type==NOMINAL?"nominal":"verbal") << " " << oss.str(); - return oss.str(); -} - -LinguisticGraphVertex SyntacticAnalyzerChains::unstackUptoChainEnd( - const SyntacticData* data, - std::vector< ChainStackTuple >& pile, - Common::MediaticData::ChainsType type - ) const -{ -/* - Critical function : commeng logging messages -*/ -// SACLOGINIT; -// LDEBUG << "unstackUptoChainEnd " << (type==NOMINAL?"nominal":(type==VERBAL?"verbal":"none")); - CVertexDataPropertyMap dataMap = get( vertex_data, (*data->iterator()->getGraph()) ); - - std::vector< ChainStackTuple >::const_reverse_iterator rit, rit_end; - rit = pile.rbegin(); rit_end = pile.rend(); - for (; rit != rit_end; rit++) - { - if ( data->matrices()->canChainEndBy(dataMap[(*rit).get<0>()], type)) - break; -// LDEBUG << "chain cannot finish by " << (*rit).get<0>(); - } - - if (rit != rit_end) - { - LinguisticGraphVertex newChainEnd = (*rit).get<0>(); -// LDEBUG << "Chain end found in pile: " << newChainEnd; - return (newChainEnd); - } - else - { -// LDEBUG << "No chain end found in pile !"; - 
return data->iterator()->firstVertex(); - } -} - -} // closing namespace SyntacticAnalysis -} // closing namespace LinguisticProcessing -} // closing namespace Lima +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. If not, see +*/ +/** + * + * @file SyntacticAnalyzer-chains.cpp + * @author Gael de Chalendar (Gael.de-Chalendar@cea.fr) + + * Copyright (c) 2003 by CEA + * @date Created on Aug, 31 2004 + * @version $Id$ + * + */ + +#include "SyntacticAnalyzer-chains.h" +#include "SyntagmaticMatrix.h" + +#include "common/AbstractFactoryPattern/SimpleFactory.h" +#include "linguisticProcessing/core/LinguisticResources/LinguisticResources.h" +#include "linguisticProcessing/core/LinguisticAnalysisStructure/AnalysisGraph.h" +#include "linguisticProcessing/core/TextSegmentation/SegmentationData.h" +#include "common/time/timeUtilsController.h" +#include "common/LimaCommon.h" + +#undef min +#undef max + +using namespace std; +//using namespace boost; +using namespace Lima::Common::MediaticData; +using namespace Lima::LinguisticProcessing::LinguisticAnalysisStructure; + +namespace Lima +{ +namespace LinguisticProcessing +{ +namespace SyntacticAnalysis +{ + +static const uint64_t DEFAULT_MAXCHAINSNBBYVERTEX = 30; +static const uint64_t DEFAULT_MAXCHAINLENGTH = 200; + +SimpleFactory syntacticAnalyzerChainsFactory(SYNTACTICANALYZERCHAINS_CLASSID); + 
+SyntacticAnalyzerChains::SyntacticAnalyzerChains() : + m_language(), + m_chainMatrix(0), + m_maxChainsNbByVertex(std::numeric_limits::max()) +{} + +void SyntacticAnalyzerChains::init( + Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration, + Manager* manager) + +{ + SACLOGINIT; + m_language=manager->getInitializationParameters().media; + m_macroAccessor=&(static_cast(Common::MediaticData::MediaticData::single().mediaData(m_language)).getPropertyCodeManager().getPropertyAccessor("MACRO")); + try + { + std::string chainMatrixId=unitConfiguration.getParamsValueAtKey("chainMatrix"); + m_chainMatrix=static_cast(LinguisticResources::single().getResource(m_language,chainMatrixId)); + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) + { + LERROR << "no parameter 'chainMatrix' in SyntacticAnalyzerChains group for language " << (int) m_language << " !"; + throw InvalidConfiguration(); + } + try + { + std::string maxChainsNbByVertexS=unitConfiguration.getParamsValueAtKey("maxChainsNbByVertex"); + std::istringstream iss(maxChainsNbByVertexS); + iss >> m_maxChainsNbByVertex; + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) + { + LWARN << "no parameter 'maxChainsNbByVertex' in SyntacticAnalyzerChains group for language " << (int) m_language << " ! Using default: "<> m_maxChainLength; + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) + { + LWARN << "no parameter 'maxChainLength' in SyntacticAnalyzerChains group for language " << (int) m_language << " ! Using default: "<(Common::MediaticData::MediaticData::single().mediaData(m_language)).getPropertyCodeManager().getPropertyManager("MACRO").getPropertyValue(id); + } + catch (Common::XMLConfigurationFiles::NoSuchParam& ) + { + LWARN << "No ponctu macrocategory defined ! 
use category PONCTU"; + m_ponctuCategory=static_cast(Common::MediaticData::MediaticData::single().mediaData(m_language)).getPropertyCodeManager().getPropertyManager("MACRO").getPropertyValue("PONCTU"); + } + +} + +LimaStatusCode SyntacticAnalyzerChains::process( + AnalysisContent& analysis) const +{ +#ifdef ANTINNO_SPECIFIC + auto const& stopAnalyze = analysis.stopAnalyze(); +#endif + Lima::TimeUtilsController timer("SyntacticAnalysis"); + SACLOGINIT; + LINFO << "start syntactic analysis - chains"; + // create syntacticData + AnalysisGraph* anagraph=static_cast(analysis.getData("PosGraph")); + if (anagraph==0) + { + LERROR << "no PosGraph ! abort"; + return MISSING_DATA; + } + SegmentationData* sb=static_cast(analysis.getData("SentenceBoundaries")); + if (sb==0) + { + LERROR << "no sentence bounds ! abort"; + return MISSING_DATA; + } + if (sb->getGraphId() != "PosGraph") { + LERROR << "SentenceBounds have been computed on " << sb->getGraphId() << " !"; + LERROR << "SyntacticAnalyzer-deps needs SentenceBounds on PosGraph"; + return INVALID_CONFIGURATION; + } + + SyntacticData* syntacticData=dynamic_cast(analysis.getData("SyntacticData")); + if (syntacticData==0) + { + syntacticData=new SyntacticData(anagraph,m_chainMatrix); + analysis.setData("SyntacticData",syntacticData); + } + else if (syntacticData->matrices() == 0) + { + syntacticData->matrices(m_chainMatrix); + } + syntacticData->setupDependencyGraph(); + + uint64_t chainId = m_firstChainId; + std::list ponctuMacroFilter; + ponctuMacroFilter.push_back(m_ponctuCategory); + +// bool l2r = true; + // ??OME2 for (SegmentationData::const_iterator boundItr=sb->begin(); + // boundItr!=sb->end(); + for (std::vector::const_iterator boundItr=(sb->getSegments()).begin(); + boundItr!=(sb->getSegments()).end(); + boundItr++) + { + LinguisticGraphVertex beginSentence=boundItr->getFirstVertex(); + LinguisticGraphVertex endSentence=boundItr->getLastVertex(); +// LDEBUG << "analyze sentence from vertex " << beginSentence << " 
to vertex " << endSentence; + LinguisticGraphVertex current, next; + current = beginSentence; next = current; + +#ifdef ANTINNO_SPECIFIC + if (stopAnalyze) + { + LERROR << "Analyze too long. Stopped in SyntacticAnalyzerChains"; + return TIME_OVERFLOW; + } +#endif + while (next != endSentence) + { +#ifdef ANTINNO_SPECIFIC + if (stopAnalyze) + { + LERROR << "Analyze too long. Stopped in SyntacticAnalyzerChains"; + return TIME_OVERFLOW; + } +#endif +// LDEBUG << "nextChainsBreak"; + next = anagraph->nextMainPathVertex(current,*m_macroAccessor,ponctuMacroFilter,endSentence); + +#ifdef ANTINNO_SPECIFIC + if (stopAnalyze) + { + LERROR << "Analyze too long. Stopped in SyntacticAnalyzerChains"; + return TIME_OVERFLOW; + } +#endif +// LDEBUG << "analyze chain from " << current << " to " << next; +// LDEBUG << "identify chains"; + identifyChains(syntacticData,current,next,chainId +#ifdef ANTINNO_SPECIFIC + , stopAnalyze +#endif + ); +#ifdef ANTINNO_SPECIFIC + if (stopAnalyze) + { + LERROR << "Analyze too long. 
Stopped in SyntacticAnalyzerChains"; + return TIME_OVERFLOW; + } +#endif + current = next; + } + beginSentence=endSentence; + } + + LINFO << "end syntactic analysis - chains"; + return SUCCESS_ID; +} + + +void SyntacticAnalyzerChains::identifyChains(SyntacticData* data, + const LinguisticGraphVertex& start, + const LinguisticGraphVertex& stop, + uint64_t& startChainId, +#ifdef ANTINNO_SPECIFIC + StopAnalyze const& stopAnalyze +#endif + ) const +{ +// SACLOGINIT; +// LDEBUG << "Searching chains from/to (morph): " << start << "/" << stop; + if (start == stop) + return; + VertexChainIdPropertyMap vertexChainIdMap = get( vertex_chain_id, *(data->graph()) ); + std::set< std::string > alreadyReported; + LinguisticGraphVertex first = data->iterator()-> firstVertex(); + LinguisticGraphVertex last = data->iterator()-> lastVertex(); + VertexDataPropertyMap dataMap = get(vertex_data, (*data->iterator()->getGraph() ) ); +// VertexTokenPropertyMap tokenMap =get(vertex_token, (*data->iterator()->getGraph() ) ); + + std::vector< ChainStackTuple > pile; + // std::stack< LinguisticGraphVertex > pileSons; + Common::MediaticData::ChainsType currentType = Common::MediaticData::NO_CHAIN_TYPE; +// std::stack< std::pair< std::deque< ChainStackTuple >, std::stack< LinguisticGraphVertex > > > tank; + std::vector< std::vector< ChainStackTuple > > tank; + std::set< LinguisticGraphVertex > alreadyFinished; + std::vector nextVxs; +// LDEBUG << "Initializing nextVxs with " << start; + nextVxs.push_back(start); + + + while (! ( tank.empty() && nextVxs.empty()) ) + { +#ifdef ANTINNO_SPECIFIC + if (stopAnalyze) + { + SACLOGINIT; + LERROR << "Analyze too long. 
Stopped in SyntacticAnalyzerChains"; + return; + } +#endif + // LDEBUG << "LOOP"; + if (pile.size() >= m_maxChainLength) + { +#ifdef DEBUG_LP + SACLOGINIT; + LNOTICE << "Chain reached its max size or is too long."; +#endif + +// LDEBUG << "Trying to find a chain end in the too long stack"; + LinguisticGraphVertex lastChainVx = unstackUptoChainEnd(data, pile, currentType); + if (lastChainVx != first) { +// LDEBUG << "Chain end is " << lastChainVx << ". Reporting the chain in the graph."; + std::string newChainString = stringChain(data,pile, currentType, alreadyFinished,startChainId,lastChainVx); + alreadyReported.insert(newChainString); + reportChainInGraph(data,pile, currentType, alreadyFinished,startChainId,lastChainVx); + LinguisticGraphOutEdgeIt it, it_end; + boost::tie(it, it_end) = out_edges(lastChainVx, *(data->graph())); +// LDEBUG << "Initializing for the sons of " << lastChainVx; + for (; it != it_end; it++) + { +#ifdef ANTINNO_SPECIFIC + if (stopAnalyze) + { +#ifdef DEBUG_LP + SACLOGINIT + LERROR << "Analyze too long. Stopped in SyntacticAnalyzerChains"; +#endif + return; + } +#endif +// LDEBUG << "Looking at an out edge of the chain's last vertex : " << *it; + LinguisticGraphVertex nextVx = target(*it, *(data->graph())); + if (alreadyFinished.find(nextVx) == alreadyFinished.end()) + { +// LDEBUG << "Adding " << nextVx << " to nextVxs"; + nextVxs.push_back(nextVx); + } + } + } + else { +// LDEBUG << "NoChainEndInStack"; + } + if ( ! tank.empty() ) + { +// LDEBUG << "Using a new stack after chain too long"; +// boost::tie(pile, pileSons) = tank.back(); + pile = tank.back(); + tank.pop_back(); + } + } + else if (tank.empty()) + { +// LDEBUG << "tank is empty"; + LinguisticGraphVertex nextVx = nextVxs.back(); + nextVxs.pop_back(); + while (alreadyFinished.find(nextVx) != alreadyFinished.end()) + { +#ifdef ANTINNO_SPECIFIC + if (stopAnalyze) + { + SACLOGINIT; + LERROR << "Analyze too long. 
Stopped in SyntacticAnalyzerChains"; + return; + } +#endif + if (nextVxs.empty()) + { +// LDEBUG << "Nothing more to work on: returning"; + return; + } +// LDEBUG << "Ignoring next vertex " << nextVx << " because it is already finished."; + nextVx = nextVxs.back(); + nextVxs.pop_back(); + while ((vertexChainIdMap[nextVx].size() >= m_maxChainsNbByVertex) ) + { + SACLOGINIT; + LNOTICE << "Vertex ignored (" << nextVx << ") because there is too much chains on it."; +// LDEBUG << "Ignoring next vertex " << nextVx << " because there is too much chains on it."; + if (nextVxs.empty()) + { +// LDEBUG << "Nothing more to work on: returning"; + return; + } + nextVx = nextVxs.back(); + nextVxs.pop_back(); + } + } +// LDEBUG << "next vertex is " << nextVx; + bool canFinish = false; + pile.clear(); +// pileSons = std::stack< LinguisticGraphVertex >(); + if ( (nextVx != first) && (nextVx != last) && + ( data->matrices()->canNominalChainBeginBy(dataMap[nextVx]) ) ) + { +// LDEBUG << "next vertex is a nominal chain beginning"; + canFinish = (data->matrices()-> canNominalChainEndBy(dataMap[nextVx])); + pile.push_back(boost::make_tuple(nextVx, canFinish, std::vector< LinguisticGraphVertex >())); + currentType = NOMINAL; + } + else if ( (nextVx != first) && (nextVx != last) && + ( data->matrices()-> canVerbalChainBeginBy(dataMap[nextVx]) ) ) + { +// LDEBUG << "next vertex is a verbal chain beginning"; + canFinish = ( data->matrices()-> canVerbalChainEndBy(dataMap[nextVx])); + pile.push_back(boost::make_tuple(nextVx, canFinish, std::vector< LinguisticGraphVertex >())); + currentType = VERBAL; + } + else + { +// LDEBUG << "next vertex " << nextVx << " is not a chain beginning"; + currentType = NO_CHAIN_TYPE; +// LDEBUG << "Adding nextVx " << nextVx << " to alreadyFinished"; +// alreadyFinished.insert(nextVx); + } + + if (nextVx != stop) + { + std::vector< LinguisticGraphVertex > sons; + LinguisticGraphOutEdgeIt it, it_end; + boost::tie(it, it_end) = out_edges(nextVx, *(data->graph())); 
+ for (; it != it_end; it++) + { +#ifdef ANTINNO_SPECIFIC + if (stopAnalyze) + { + SACLOGINIT; + LERROR << "Analyze too long. Stopped in SyntacticAnalyzerChains"; + return; + } +#endif +// LDEBUG << "Looking at the next vertex out edge: " << *it; + LinguisticGraphVertex nextNext = target(*it, *(data->graph())); + if (nextNext != last) + { + if ( ( alreadyFinished.find(nextNext) == alreadyFinished.end()) && (currentType != NO_CHAIN_TYPE) ) + { +// LDEBUG << "Adding " << nextNext << " to sons of " << nextVx; + sons.push_back(nextNext); + } + else + { +// LDEBUG << "Adding " << nextNext << " to nextVxs"; + nextVxs.push_back(nextNext); + // The addition of the line below seems to solve a loop problem + // whithout producing regressions in TVA tests. + alreadyFinished.insert(nextVx); + } + } + } + if (!sons.empty() && !pile.empty()) + { +// LDEBUG << nextVx << " has sons: pushing them to the tank"; +// tank.push_back(std::make_pair(pile, sons)); + pile.back().get<2>() = sons; + tank.push_back(pile); + } + } + } + else + { + LinguisticGraphVertex father = pile.back().get<0>(); + LinguisticGraphVertex currentSon = pile.back().get<2>().back(); +// LDEBUG << "Father and current son are: " << father << " / " << currentSon; + pile.back().get<2>().pop_back(); + if ( (currentType == NO_CHAIN_TYPE) && (pile.empty()) ) + { + if ( data->matrices()->canNominalChainBeginBy(dataMap[currentSon])) + currentType = NOMINAL; + else if ( data->matrices()->canVerbalChainBeginBy(dataMap[currentSon])) + currentType = VERBAL; + } + + if ( currentType != NO_CHAIN_TYPE ) + { +// LDEBUG << "Current type is " << currentType; + // -------------> + // endroit ou mettre le bloc deplace + // <------------- + if ( (currentSon != last) && + ( data->matrices()-> belongsToMatrix( + dataMap[father], + dataMap[currentSon], + currentType ) ) ) + { +// LDEBUG << father << " -> " << currentSon << " is in the matrix"; + bool canFinish = ( data->matrices()->canChainEndBy(dataMap[currentSon], currentType)); + // 
bloc ci-dessous a deplacer plus haut pour explorer + // toutes les chaines. Pb: rend le parcours tres tres lourd. + // -------------> + if (!pile.empty() && !pile.back().get<2>().empty()) + { +// LDEBUG << father << " has remaining sons: pushing them to the tank"; +// tank.push_back(std::make_pair(pile, pileSons)); + tank.push_back(pile); + } + // <------------- +// LDEBUG << "Pushing " << currentSon << "(" << canFinish << ")"; + pile.push_back(boost::make_tuple(currentSon, canFinish, std::vector< LinguisticGraphVertex >())); + if (currentSon != stop) + { + std::vector< LinguisticGraphVertex >& sons = pile.back().get<2>(); + LinguisticGraphOutEdgeIt it, it_end; + boost::tie(it, it_end) = out_edges(currentSon, *(data->graph())); + for (; it != it_end; it++) + { +// LDEBUG << "Edge is " << *it; +// LDEBUG << "Adding " << target(*it, *(data->graph())) << " to sons of " << currentSon; + sons.push_back(target(*it, *(data->graph()))); + } + } + else + { +// LDEBUG << "Stop reached"; + if (canFinish) + { +// LDEBUG << "currentSon " << currentSon << " is a possible end. Reporting the chain in the graph."; + std::string newChainString = stringChain(data, pile, currentType, alreadyFinished,startChainId,currentSon); + alreadyReported.insert(newChainString); + reportChainInGraph(data, pile, currentType, alreadyFinished,startChainId, currentSon); + } + else + { +// LDEBUG << "currentSon " << currentSon << " is not a possible end."; +// LDEBUG << "Trying to find a chain end in the stack"; + LinguisticGraphVertex lastChainVx = unstackUptoChainEnd(data, pile, currentType); + if (lastChainVx!=first) { +// LDEBUG << "Chain end is " << lastChainVx << ". 
Reporting the chain in the graph."; + std::string newChainString = stringChain(data, pile, currentType, alreadyFinished,startChainId,lastChainVx); + alreadyReported.insert(newChainString); + reportChainInGraph(data, pile, currentType, alreadyFinished,startChainId,lastChainVx); + LinguisticGraphOutEdgeIt it, it_end; + boost::tie(it, it_end) = out_edges(lastChainVx, *(data->graph())); +// LDEBUG << "Initializing for the sons of " << lastChainVx; + for (; it != it_end; it++) + { +// LDEBUG << "Looking at an out edge of the chain's last vertex : " << *it; + LinguisticGraphVertex nextVx = target(*it, *(data->graph())); + if (alreadyFinished.find(nextVx) == alreadyFinished.end()) + { +// LDEBUG << "Adding " << nextVx << " to nextVxs"; + nextVxs.push_back(nextVx); + } + } + } +// else +// { +// LDEBUG << "NoChainEndInStackException catched"; +// } + } + } + } + else + { +#ifdef ANTINNO_SPECIFIC + if (stopAnalyze) + { + SACLOGINIT; + LERROR << "Analyze too long. Stopped in SyntacticAnalyzerChains"; + return; + } +#endif +// LDEBUG << father << " -> " << currentSon << " NOT in the matrix"; + LinguisticGraphVertex lastChainVx = unstackUptoChainEnd(data, pile, currentType); + if (lastChainVx!=first) + { + std::string newChainString = stringChain(data, pile, currentType, alreadyFinished,startChainId,lastChainVx); + if (alreadyReported.find(newChainString) == alreadyReported.end()) + { +// LDEBUG << "Reporting chain: " << newChainString; + alreadyReported.insert(newChainString); + reportChainInGraph(data, pile, currentType, alreadyFinished,startChainId,lastChainVx); + LinguisticGraphOutEdgeIt it, it_end; + boost::tie(it, it_end) = out_edges(lastChainVx, *(data->graph())); +// LDEBUG << "Initializing for the sons of " << lastChainVx << " after unstacking"; + for (; it != it_end; it++) + { +#ifdef ANTINNO_SPECIFIC + if (stopAnalyze) + { + SACLOGINIT; + LERROR << "Analyze too long. 
Stopped in SyntacticAnalyzerChains"; + return; + } +#endif +// LDEBUG << "Looking at an out edge of the chain's last vertex : " << *it; + LinguisticGraphVertex nextVx = target(*it, *(data->graph())); + if (alreadyFinished.find(nextVx) == alreadyFinished.end()) + { +// LDEBUG << "Adding " << nextVx << " to nextVxs"; + nextVxs.push_back(nextVx); + } + } + } +// else +// { +// LDEBUG << "This chain (" << newChainString << ") has already been found. Nothing to do."; +// } + } + else + { +// LDEBUG << "No end of chain found in pile"; + if (alreadyFinished.find(currentSon) == alreadyFinished.end()) + { + if ( parentsFinished(data, father, alreadyFinished ) ) + { +// LDEBUG << "Adding father " << father << " to alreadyFinished"; + alreadyFinished.insert(father); + } + if (currentSon != last) + { +// LDEBUG << "Adding " << currentSon << " to nextVxs"; + nextVxs.push_back(currentSon); + } + else + { +// LDEBUG << "Adding current son " << currentSon << " to alreadyFinished"; + alreadyFinished.insert(currentSon); + } + } + } + } + } + + if ( (pile.empty() || pile.back().get<2>().empty()) && (! 
tank.empty()) ) + { +// LDEBUG << "Using a new stack"; +// boost::tie(pile, pileSons) = tank.back(); + pile = tank.back(); + tank.pop_back(); + } + } + } +// LDEBUG << "<========= chains search finished"; +} + +void SyntacticAnalyzerChains::reportChainInGraph( + SyntacticData* data, + const std::vector< ChainStackTuple >& pile, + Common::MediaticData::ChainsType type, + std::set< LinguisticGraphVertex >& alreadyFinished, + uint64_t& chainId, + const LinguisticGraphVertex& stop) const +{ +// SACLOGINIT; +// LDEBUG << "SyntacticAnalyzerChains::reportChainInGraph"; + + ChainIdStruct property = ChainIdStruct(type, chainId); + + VertexChainIdPropertyMap vertexChainIdMap = get( vertex_chain_id, *(data->graph()) ); + + std::vector< ChainStackTuple >::const_iterator it, it_end; + it = pile.begin(); it_end = pile.end(); + for (; it != it_end; it++) + { + LinguisticGraphVertex current = (*it).get<0>(); + if ((vertexChainIdMap[current].size() >= m_maxChainsNbByVertex) ) + { + SACLOGINIT; + LNOTICE << "Too much chains on " << current << " ; cannot add a new one."; + return; + } + } + + + std::vector< ChainStackTuple >::const_iterator it_beg, it_last; + it = pile.begin(); it_beg = pile.begin(); + it_end = pile.end(); it_last = --(pile.end()); + std::ostringstream oss; + for (; it != it_end; it++) + { + LinguisticGraphVertex current = (*it).get<0>(); + if (it == it_beg) + { + if (it_beg == it_last) + property = ChainIdStruct(type, chainId, LinguisticAnalysisStructure::UNIGRAM); + else + property = ChainIdStruct(type, chainId, LinguisticAnalysisStructure::BEGIN); + } + else if (it == it_last) + { + property = ChainIdStruct(type, chainId, LinguisticAnalysisStructure::END); + } + else + { + property = ChainIdStruct(type, chainId, LinguisticAnalysisStructure::PART); + } + oss << current; + if (current != data->iterator()->firstVertex() && current != data->iterator()->lastVertex() + && (vertexChainIdMap[current].size() < m_maxChainsNbByVertex) ) + { +// LDEBUG << "executing: 
vertexChainIdMap[" << current << "].insert(" << property << ")"; + vertexChainIdMap[current].insert(property); + + if (pile.size() > 1) + { + std::vector< ChainStackTuple >::const_iterator it2, it2_end; + it2 = pile.begin(); it2_end = pile.end(); + bool ok = false; + for (; it2 != it2_end; it2++) + { + LinguisticGraphVertex other = (*it2).get<0>(); + if (other != current) + { + LinguisticGraphEdge e; bool found; + boost::tie (e, found) = edge(current, other, *(data->graph())); + if (found) + { + ok = true; + break; + } + else + { + boost::tie(e, found) = edge(other, current, *(data->graph())); + if (found) + { + ok = true; + break; + } + } + } + } + if (!ok) + { + SACLOGINIT; + LWARN << "An edge should exist for " << current << " !"; + } + } + } + else if (vertexChainIdMap[current].size() >= m_maxChainsNbByVertex) + { + SACLOGINIT; + LNOTICE << "Too much chains on " << current << " ; cannot add a new one."; + } + if (current == stop) + break; + else + oss << " "; + if (current != data->iterator()->firstVertex() && current != data->iterator()->lastVertex() + && (vertexChainIdMap[current].size() < m_maxChainsNbByVertex) ) + if (parentsFinished(data, current, alreadyFinished)) + { +/* LDEBUG << "Parents of " << current << " are finished ; so it too."; + alreadyFinished.insert(current);*/ + } + } +// LDEBUG << "Chain " << chainId << " is : " << (type==NOMINAL?"nominal":"verbal") << " " << oss.str(); + chainId++; + } + +bool SyntacticAnalyzerChains::parentsFinished( + const SyntacticData* data, + const LinguisticGraphVertex& v, + const std::set< LinguisticGraphVertex >& alreadyFinished) const +{ +/* + Critical function : comment logging messages +*/ +// SACLOGINIT; +// LDEBUG << "SyntacticAnalyzerChains::parentsFinished"; + + LinguisticGraphInEdgeIt it, it_end; + boost::tie(it, it_end) = in_edges(v, *(data->graph())); + for (; it != it_end; it++) + { + if (alreadyFinished.find(source(*it, *(data->graph()))) == alreadyFinished.end()) + return false; + } + return true; +} 
+ +std::string SyntacticAnalyzerChains::stringChain( + const SyntacticData* data, + const std::vector< ChainStackTuple >& pile, + Common::MediaticData::ChainsType type, + std::set< LinguisticGraphVertex >& alreadyFinished, + uint64_t chainId, + const LinguisticGraphVertex& stop) const +{ +/* + Critical Function : comment logging messages +*/ +// SACLOGINIT; + ChainIdStruct property = ChainIdStruct(type, chainId); + + std::vector< ChainStackTuple >::const_iterator it, it_beg, it_end, it_last; + it = pile.begin(); it_beg = pile.begin(); + it_end = pile.end(); it_last = --(pile.end()); + std::ostringstream oss; + for (; it != it_end; it++) + { + if (it == it_beg) + { + if (it_beg == it_last) + property = ChainIdStruct(type, chainId, LinguisticAnalysisStructure::UNIGRAM); + else + property = ChainIdStruct(type, chainId, LinguisticAnalysisStructure::BEGIN); + } + else if (it == it_last) + { + property = ChainIdStruct(type, chainId, LinguisticAnalysisStructure::END); + } + else + { + property = ChainIdStruct(type, chainId, LinguisticAnalysisStructure::PART); + } + oss << (*it).get<0>(); + LinguisticGraphVertex current = (*it).get<0>(); + if (current == stop) + break; + else + oss << " "; + if (current != data->iterator()->firstVertex() && current != data->iterator()->lastVertex()) + { + if (pile.size() > 1) + { + std::vector< ChainStackTuple >::const_iterator it2, it2_end; + // @todo replace by lookup only previous and next vertex in pile + it2 = pile.begin(); it2_end = pile.end(); + bool ok = false; + for (; it2 != it2_end; it2++) + { + LinguisticGraphVertex other = (*it2).get<0>(); + if (other != current) + { + LinguisticGraphEdge e; bool found; + boost::tie (e, found) = edge(current, other, *(data->graph())); + if (found) + { + ok = true; + break; + } + else + { + boost::tie (e, found) = edge(other, current, *(data->graph())); + if (found) + { + ok = true; + break; + } + } + } + } + if (!ok) + { + SALOGINIT; + LWARN << "An edge should exist for " << current << " !"; + 
} + } + } + if ( parentsFinished(data, current, alreadyFinished) ) + { +// LDEBUG << "Adding current " << current << " to alreadyFinished"; + alreadyFinished.insert(current); + } + } +// LDEBUG << "In stringChain, chain " << chainId << " is : " << (type==NOMINAL?"nominal":"verbal") << " " << oss.str(); + return oss.str(); +} + +LinguisticGraphVertex SyntacticAnalyzerChains::unstackUptoChainEnd( + const SyntacticData* data, + std::vector< ChainStackTuple >& pile, + Common::MediaticData::ChainsType type + ) const +{ +/* + Critical function : commeng logging messages +*/ +// SACLOGINIT; +// LDEBUG << "unstackUptoChainEnd " << (type==NOMINAL?"nominal":(type==VERBAL?"verbal":"none")); + CVertexDataPropertyMap dataMap = get( vertex_data, (*data->iterator()->getGraph()) ); + + std::vector< ChainStackTuple >::const_reverse_iterator rit, rit_end; + rit = pile.rbegin(); rit_end = pile.rend(); + for (; rit != rit_end; rit++) + { + if ( data->matrices()->canChainEndBy(dataMap[(*rit).get<0>()], type)) + break; +// LDEBUG << "chain cannot finish by " << (*rit).get<0>(); + } + + if (rit != rit_end) + { + LinguisticGraphVertex newChainEnd = (*rit).get<0>(); +// LDEBUG << "Chain end found in pile: " << newChainEnd; + return (newChainEnd); + } + else + { +// LDEBUG << "No chain end found in pile !"; + return data->iterator()->firstVertex(); + } +} + +} // closing namespace SyntacticAnalysis +} // closing namespace LinguisticProcessing +} // closing namespace Lima diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntacticAnalyzer-chains.h b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntacticAnalyzer-chains.h index c5edf04de..8b7e5d76f 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntacticAnalyzer-chains.h +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntacticAnalyzer-chains.h @@ -118,7 +118,12 @@ class LIMA_SYNTACTICANALYSIS_EXPORT 
SyntacticAnalyzerChains : public MediaProces void identifyChains(SyntacticData* data, const LinguisticGraphVertex& s, const LinguisticGraphVertex& t, +#ifdef ANTINNO_SPECIFIC + uint64_t& startChainId, StopAnalyze const& stopAnalyze = defaultStopAnalyze) const; +#else uint64_t& startChainId) const; +#endif + /** diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntacticAnalyzer-deps.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntacticAnalyzer-deps.cpp index 8664bc0a9..92b3da6eb 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntacticAnalyzer-deps.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntacticAnalyzer-deps.cpp @@ -115,6 +115,10 @@ LimaStatusCode SyntacticAnalyzerDeps::process( SAPLOGINIT; LINFO << "start syntactic analysis - dependence relations search"; +#ifdef ANTINNO_SPECIFIC + auto const& stopAnalyze = analysis.stopAnalyze(); +#endif + AnalysisGraph* anagraph=static_cast(analysis.getData("PosGraph")); if (anagraph==0) { @@ -155,9 +159,9 @@ LimaStatusCode SyntacticAnalyzerDeps::process( { LinguisticGraphVertex beginSentence=boundItr->getFirstVertex(); LinguisticGraphVertex endSentence=boundItr->getLastVertex(); -// LDEBUG << "analyze sentence from vertex " << beginSentence -// << " to vertex " << endSentence; - +#ifdef DEBUG_LP + LDEBUG << "analyze sentence from vertex " << beginSentence << " to vertex " << endSentence; +#endif std::deque< std::string >::const_iterator actionsit, actionsit_end; actionsit = m_actions.begin(); actionsit_end = m_actions.end(); for (; actionsit != actionsit_end; actionsit++) @@ -169,10 +173,14 @@ LimaStatusCode SyntacticAnalyzerDeps::process( } else { -// LDEBUG << "Geting automaton"; +#ifdef DEBUG_LP + LDEBUG << "Geting automaton for action" << action; +#endif Automaton::Recognizer* recognizer = const_cast< Automaton::Recognizer* >((*(m_recognizers.find(action))).second); 
std::vector result; -// LDEBUG << "Applying automaton for action " << action << " on sentence from " << beginSentence << " to " << endSentence; +#ifdef DEBUG_LP + LDEBUG << "Applying automaton for action " << action << " on sentence from " << beginSentence << " to " << endSentence; +#endif try { recognizer->apply(*anagraph, @@ -186,6 +194,13 @@ LimaStatusCode SyntacticAnalyzerDeps::process( false, // return at first success=false m_applySameRuleWhileSuccess // depends on config file ); +#ifdef ANTINNO_SPECIFIC + if (stopAnalyze) + { + LERROR << "Analyze too long. Stopped SyntacticAnalyzerDeps "; + return TIME_OVERFLOW; + } +#endif } catch (const PhoenixGraphHomoDepsVisitor::StartFinishedException& e) {} } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntacticAnalyzer-simplify.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntacticAnalyzer-simplify.cpp index b3cc8b66c..70282282b 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntacticAnalyzer-simplify.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntacticAnalyzer-simplify.cpp @@ -66,6 +66,8 @@ void SyntacticAnalyzerSimplify::init( Manager* manager) { + SASLOGINIT; + LINFO << "SyntacticAnalyzerSimplify::init"; m_language=manager->getInitializationParameters().media; std::string rules=unitConfiguration.getParamsValueAtKey("simplifyAutomaton"); m_recognizer = static_cast(LinguisticResources::single().getResource(m_language,rules)); @@ -76,7 +78,7 @@ LimaStatusCode SyntacticAnalyzerSimplify::process( { Lima::TimeUtilsController timer("SyntacticAnalysis"); SASLOGINIT; - LINFO << "start syntactic analysis - subsentences simplification"; + LINFO << "SyntacticAnalyzerSimplify::process"; AnalysisGraph* anagraph=static_cast(analysis.getData("PosGraph")); if (anagraph==0) diff --git 
a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntagmaticMatrix.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntagmaticMatrix.cpp index 5ee1a0695..983c80596 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntagmaticMatrix.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/SyntacticAnalysis/SyntagmaticMatrix.cpp @@ -29,6 +29,7 @@ #include "SyntagmaticMatrix.h" #include "XmlSyntagmaticMatrixFileHandler.h" #include "common/AbstractFactoryPattern/SimpleFactory.h" +#include "common/tools/FileUtils.h" #include @@ -77,9 +78,8 @@ void SyntagmDefStruct::init( m_verbalMatrix.language(m_language); try { - std::string resourcePath=Common::MediaticData::MediaticData::single().getResourcesPath(); - std::string matricesFileName=resourcePath + "/" + unitConfiguration.getParamsValueAtKey("file"); - loadFromFile(matricesFileName); + QString matricesFileName = findFileInPaths(Common::MediaticData::MediaticData::single().getResourcesPath().c_str(),unitConfiguration.getParamsValueAtKey("file").c_str()); + loadFromFile(matricesFileName.toUtf8().constData()); } catch (Common::XMLConfigurationFiles::NoSuchParam& ) { diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/TextSegmentation/SegmentationResultsLoader.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/TextSegmentation/SegmentationResultsLoader.cpp index 8cd729c12..4f3127d2c 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/TextSegmentation/SegmentationResultsLoader.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/TextSegmentation/SegmentationResultsLoader.cpp @@ -109,7 +109,11 @@ LimaStatusCode SegmentationResultsLoader::process(AnalysisContent& analysis) con SegmentationResultsLoader::XMLHandler handler(segmData,graph); m_parser->setContentHandler(&handler); m_parser->setErrorHandler(&handler); +#ifdef ANTINNO_SPECIFIC QFile 
file(getInputFile(analysis).c_str()); +#else + QFile file(getInputFile(analysis)); +#endif if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) throw XMLException(); if (!m_parser->parse( QXmlInputSource(&file))) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/TextSegmentation/SentenceBoundariesFinder.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/TextSegmentation/SentenceBoundariesFinder.cpp index 017a060d8..e92f1ce1b 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/TextSegmentation/SentenceBoundariesFinder.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/TextSegmentation/SentenceBoundariesFinder.cpp @@ -128,6 +128,10 @@ LimaStatusCode SentenceBoundariesFinder::process( return MISSING_DATA; } +#ifdef ANTINNO_SPECIFIC + auto const& stopAnalyze = analysis.stopAnalyze(); +#endif + LinguisticGraphVertex lastVx=anagraph->lastVertex(); LinguisticGraphVertex beginSentence=anagraph->firstVertex(); #ifdef DEBUG_LP @@ -140,6 +144,13 @@ LimaStatusCode SentenceBoundariesFinder::process( if (m_boundaryValues.empty()) { while (beginSentence!=lastVx) { +#ifdef ANTINNO_SPECIFIC + if (stopAnalyze) + { + LERROR << "Analyze too long. Stopped in SentenceBoundariesFinder"; + return TIME_OVERFLOW; + } +#endif LinguisticGraphVertex endSentence=anagraph->nextMainPathVertex(beginSentence,*m_microAccessor,m_boundaryMicros,lastVx); #ifdef DEBUG_LP LDEBUG << "found endSentence at " << endSentence; @@ -154,6 +165,13 @@ LimaStatusCode SentenceBoundariesFinder::process( LinguisticGraphVertex endSentence=anagraph->nextMainPathVertex(beginSentence,*m_microAccessor,m_boundaryMicros,lastVx); while (endSentence!=lastVx) { +#ifdef ANTINNO_SPECIFIC + if (stopAnalyze) + { + LERROR << "Analyze too long. 
Stopped in SentenceBoundariesFinder"; + return TIME_OVERFLOW; + } +#endif Token* t=get(vertex_token,*(anagraph->getGraph()),endSentence); #ifdef DEBUG_LP if (t!=0) { @@ -165,6 +183,13 @@ LimaStatusCode SentenceBoundariesFinder::process( } #endif if (t==0 || m_boundaryValues.find(t->stringForm())!=m_boundaryValues.end()) { +#ifdef ANTINNO_SPECIFIC + if (stopAnalyze) + { + LERROR << "Analyze too long. Stopped in SentenceBoundariesFinder"; + return TIME_OVERFLOW; + } +#endif sb->add(Segment("sentence",beginSentence,endSentence,anagraph)); beginSentence=endSentence; } diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/WordSenseAnalysis/Test.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/WordSenseAnalysis/Test.cpp index b7059aa12..38d893011 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/WordSenseAnalysis/Test.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/WordSenseAnalysis/Test.cpp @@ -23,7 +23,7 @@ int main(int argc,char* argv[]) std::string lineString; size_t linenum(0); - getline(ifl, lineString); + lineString = Lima::Common::Misc::readLine(ifl); while (ifl.good() && !ifl.eof()) { Common::Misc::chomp(lineString); diff --git a/lima_linguisticprocessing/test/CMakeLists.txt b/lima_linguisticprocessing/test/CMakeLists.txt index b67fe0456..32e11f6dc 100644 --- a/lima_linguisticprocessing/test/CMakeLists.txt +++ b/lima_linguisticprocessing/test/CMakeLists.txt @@ -62,6 +62,20 @@ target_link_libraries(analyzeText install(TARGETS analyzeText DESTINATION bin) +########### next target ############### +if (${PYTHONLIBS_FOUND}) + SET(srl_SRCS + srl.cpp + ) + + add_executable(srl ${srl_SRCS}) + target_link_libraries(srl + ${PYTHON_LIBRARY} + ${QT_LIBRARIES} + ) + + install(TARGETS srl DESTINATION bin) +endif() ########### next target ############### # SET(threadedAnalyzeText_SRCS diff --git a/lima_linguisticprocessing/test/analyzeText.cpp b/lima_linguisticprocessing/test/analyzeText.cpp index 
afb7772f5..95e7fb58e 100644 --- a/lima_linguisticprocessing/test/analyzeText.cpp +++ b/lima_linguisticprocessing/test/analyzeText.cpp @@ -30,6 +30,7 @@ #include "common/XMLConfigurationFiles/xmlConfigurationFileParser.h" #include "common/Data/strwstrtools.h" #include "common/time/traceUtils.h" +#include "common/tools/FileUtils.h" #include "common/QsLog/QsLog.h" #include "common/QsLog/QsLogDest.h" #include "common/QsLog/QsLogCategories.h" @@ -41,6 +42,7 @@ #include "linguisticProcessing/client/AnalysisHandlers/BowTextWriter.h" #include "linguisticProcessing/client/AnalysisHandlers/BowTextHandler.h" #include "linguisticProcessing/client/AnalysisHandlers/SimpleStreamHandler.h" +#include "linguisticProcessing/client/AnalysisHandlers/LTRTextHandler.h" #include "linguisticProcessing/core/EventAnalysis/EventHandler.h" #include "linguisticProcessing/core/LinguisticResources/AbstractResource.h" #include "linguisticProcessing/core/LinguisticResources/LinguisticResources.h" @@ -91,17 +93,22 @@ int main(int argc, char **argv) int run(int argc,char** argv) { - QsLogging::initQsLog(); - // Necessary to initialize factories under Windows + QStringList configDirs = buildConfigurationDirectoriesList(QStringList() << "lima",QStringList()); + QString configPath = configDirs.join(LIMA_PATH_SEPARATOR); + + QStringList resourcesDirs = buildResourcesDirectoriesList(QStringList() << "lima",QStringList()); + QString resourcesPath = resourcesDirs.join(LIMA_PATH_SEPARATOR); + + QsLogging::initQsLog(configPath); + // Necessary to initialize factories Lima::AmosePluginsManager::single(); + Lima::AmosePluginsManager::changeable().loadPlugins(configPath); // std::cerr << "Amose plugins initialized" << std::endl; - std::string resourcesPath; - std::string configDir; + std::string strResourcesPath; std::string lpConfigFile; std::string commonConfigFile; std::string clientId; - std::string cesartOutput; std::vector languages; std::vector dumpersv; std::vector outputsv; @@ -110,6 +117,7 @@ int 
run(int argc,char** argv) std::vector vinactiveUnits; std::string meta; std::string splitMode; + std::string strConfigPath; po::options_description desc("Usage"); @@ -123,9 +131,9 @@ int run(int argc,char** argv) "where to write dumpers output. By default, each dumper writes its results on a file whose name is the input file with a predefined suffix appended. This option allows to chose another suffix or to write on standard output. Its syntax is the following: : with a dumper name and destination, either the value 'stdout' or a suffix.") ("mm-core-client", po::value(&clientId)->default_value("lima-coreclient"), "Set the linguistic processing client to use") - ("resources-dir", po::value(&resourcesPath)->default_value(qgetenv("LIMA_RESOURCES").constData()==0?"":qgetenv("LIMA_RESOURCES").constData(),"$LIMA_RESOURCES"), + ("resources-dir", po::value(&strResourcesPath), "Set the directory containing the LIMA linguistic resources") - ("config-dir", po::value(&configDir)->default_value(qgetenv("LIMA_CONF").constData()==0?"":qgetenv("LIMA_CONF").constData(),"$LIMA_CONF"), + ("config-dir", po::value(&strConfigPath), "Set the directory containing the (LIMA) configuration files") ("common-config-file", po::value(&commonConfigFile)->default_value("lima-common.xml"), "Set the LIMA common libraries configuration file to use") @@ -158,13 +166,15 @@ int run(int argc,char** argv) std::cout << desc << std::endl; return SUCCESS_ID; } - if (resourcesPath.empty()) + if (!strResourcesPath.empty()) { - resourcesPath = "/usr/share/apps/lima/resources/"; + resourcesPath = QString::fromUtf8(strResourcesPath.c_str()); + resourcesDirs = resourcesPath.split(LIMA_PATH_SEPARATOR); } - if (configDir.empty()) + if (!strConfigPath.empty()) { - configDir = "/usr/share/config/lima/"; + configPath = QString::fromUtf8(strConfigPath.c_str()); + configDirs = configPath.split(LIMA_PATH_SEPARATOR); } std::deque langs(languages.size()); std::copy(languages.begin(), languages.end(), langs.begin()); @@ 
-242,24 +252,36 @@ int run(int argc,char** argv) uint64_t beginTime=TimeUtils::getCurrentTime(); - AbstractLinguisticProcessingClient* client(0); - // initialize common Common::MediaticData::MediaticData::changeable().init( - resourcesPath, - configDir, + resourcesPath.toUtf8().constData(), + configPath.toUtf8().constData(), commonConfigFile, langs); - // initialize linguistic processing - Lima::Common::XMLConfigurationFiles::XMLConfigurationFileParser lpconfig(configDir + "/" + lpConfigFile); - LinguisticProcessingClientFactory::changeable().configureClientFactory( - clientId, - lpconfig, - langs, - pipelines); + bool clientFactoryConfigured = false; + Q_FOREACH(QString configDir, configDirs) + { + if (QFileInfo(configDir + "/" + lpConfigFile.c_str()).exists()) + { + // initialize linguistic processing + Lima::Common::XMLConfigurationFiles::XMLConfigurationFileParser lpconfig((configDir + "/" + lpConfigFile.c_str()).toStdString()); + LinguisticProcessingClientFactory::changeable().configureClientFactory( + clientId, + lpconfig, + langs, + pipelines); + clientFactoryConfigured = true; + break; + } + } + if(!clientFactoryConfigured) + { +// std::cerr << "No LinguisticProcessingClientFactory were configured with" << configDirs.join(LIMA_PATH_SEPARATOR).toStdString() << "and" << lpConfigFile << std::endl; + return EXIT_FAILURE; + } - client=static_cast(LinguisticProcessingClientFactory::single().createClient(clientId)); + std::shared_ptr< AbstractLinguisticProcessingClient > client = std::dynamic_pointer_cast(LinguisticProcessingClientFactory::single().createClient(clientId)); // Set the handlers std::map handlers; @@ -268,6 +290,7 @@ int run(int argc,char** argv) BowTextHandler* bowTextHandler = 0; SimpleStreamHandler* simpleStreamHandler = 0; SimpleStreamHandler* fullXmlSimpleStreamHandler = 0; + LTRTextHandler* ltrTextHandler=0; if (dumpers.find("event") != dumpers.end()) { @@ -294,6 +317,11 @@ int run(int argc,char** argv) fullXmlSimpleStreamHandler = new 
SimpleStreamHandler(); handlers.insert(std::make_pair("fullXmlSimpleStreamHandler", fullXmlSimpleStreamHandler)); } + if (dumpers.find("ltr") != dumpers.end()) + { + ltrTextHandler= new LTRTextHandler(); + handlers.insert(std::make_pair("ltrTextHandler", ltrTextHandler)); + } std::map metaData; @@ -342,7 +370,7 @@ int run(int argc,char** argv) if (splitMode == "lines") { int lineNum = 0, nbLines = 0; - std::cerr << "Counting number of lines…"; + std::cerr << "Counting number of lines…"; while (!file.atEnd()) { file.readLine(); @@ -388,22 +416,28 @@ int run(int argc,char** argv) closeHandlerOutputFile(fullxmlofs); } std::cout << std::endl; - delete client; // free handlers if (eventHandler != 0) delete eventHandler; if (bowTextWriter!= 0) delete bowTextWriter; - if (bowTextHandler!= 0) - delete bowTextHandler; if (simpleStreamHandler!= 0) delete simpleStreamHandler; if (fullXmlSimpleStreamHandler!= 0) delete fullXmlSimpleStreamHandler; - delete Common::MediaticData::MediaticData::pchangeable(); - delete LinguisticProcessingClientFactory::pchangeable(); + if (bowTextHandler!= 0) { + // not handled in output file: just print on output (this should just be used for testing) + std::cout << bowTextHandler->getBowText(); + delete bowTextHandler; + } + if (ltrTextHandler!= 0) { + // not handled in output file: just print on output (this should just be used for testing) + std::cout << ltrTextHandler->getLTRText(); + delete ltrTextHandler; + } TIMELOGINIT; LINFO << "Total: " << TimeUtils::diffTime(beginTime,TimeUtils::getCurrentTime()) << " ms"; + TimeUtils::logAllCumulatedTime("et finalement..."); return SUCCESS_ID; } diff --git a/lima_linguisticprocessing/test/analyzetextservercore.cpp b/lima_linguisticprocessing/test/analyzetextservercore.cpp index 9b5f90464..a938ab02a 100644 --- a/lima_linguisticprocessing/test/analyzetextservercore.cpp +++ b/lima_linguisticprocessing/test/analyzetextservercore.cpp @@ -139,10 +139,9 @@ int run(int argc,char** argv) uint64_t 
beginTime=TimeUtils::getCurrentTime(); - AbstractLinguisticProcessingClient* client(0); - std::map handlers; + std::shared_ptr< AbstractLinguisticProcessingClient > client; try { // initialize common @@ -160,7 +159,7 @@ int run(int argc,char** argv) langs, pipelines); - client=dynamic_cast(LinguisticProcessingClientFactory::single().createClient(clientId)); + client = std::dynamic_pointer_cast(LinguisticProcessingClientFactory::single().createClient(clientId)); } catch (InvalidConfiguration& e) { @@ -366,7 +365,6 @@ int run(int argc,char** argv) std::cout << "ERROR: unknown error." << std::endl; } } - delete client; TIMELOGINIT; LINFO << "Total: " << TimeUtils::diffTime(beginTime,TimeUtils::getCurrentTime()) << " ms"; diff --git a/lima_linguisticprocessing/test/limaServer/AnalysisWrapper.cpp b/lima_linguisticprocessing/test/limaServer/AnalysisWrapper.cpp index 1734d4227..f40961d9f 100644 --- a/lima_linguisticprocessing/test/limaServer/AnalysisWrapper.cpp +++ b/lima_linguisticprocessing/test/limaServer/AnalysisWrapper.cpp @@ -37,7 +37,6 @@ #include #include #include -#include #include #include // std::stringstream @@ -53,19 +52,20 @@ class AnalysisWrapperPrivate { friend class AnalysisWrapper; public: - AnalysisWrapperPrivate (Lima::LinguisticProcessing::AbstractLinguisticProcessingClient* analyzer, + AnalysisWrapperPrivate (std::shared_ptr< AbstractLinguisticProcessingClient > analyzer, const std::set& langs); ~AnalysisWrapperPrivate() {} std::ostream* openHandlerOutputFile(AbstractTextualAnalysisHandler* handler, const std::string& fileName, const std::set&dumpers, const std::string& dumperId); void closeHandlerOutputFile(std::ostream* ofs); - boost::shared_ptr< Lima::LinguisticProcessing::AbstractLinguisticProcessingClient > m_analyzer; + std::shared_ptr< Lima::LinguisticProcessing::AbstractLinguisticProcessingClient > m_analyzer; const std::set& m_langs; }; 
-AnalysisWrapperPrivate::AnalysisWrapperPrivate(Lima::LinguisticProcessing::AbstractLinguisticProcessingClient* analyzer, - const std::set& langs) : +AnalysisWrapperPrivate::AnalysisWrapperPrivate( + std::shared_ptr< AbstractLinguisticProcessingClient > analyzer, + const std::set& langs) : m_analyzer(analyzer), m_langs(langs) { @@ -106,7 +106,7 @@ void AnalysisWrapperPrivate::closeHandlerOutputFile(std::ostream* ofs) } -AnalysisWrapper::AnalysisWrapper (Lima::LinguisticProcessing::AbstractLinguisticProcessingClient* analyzer, +AnalysisWrapper::AnalysisWrapper (std::shared_ptr< Lima::LinguisticProcessing::AbstractLinguisticProcessingClient > analyzer, const std::set& langs, QObject* parent ): QObject(parent), m_d(new AnalysisWrapperPrivate(analyzer,langs)) @@ -117,8 +117,6 @@ AnalysisWrapper::AnalysisWrapper (Lima::LinguisticProcessing::AbstractLinguistic AnalysisWrapper::~AnalysisWrapper() { - CORECLIENTLOGINIT; - LDEBUG << "AnalysisWrapper::~AnalysisWrapper"; delete m_d; } diff --git a/lima_linguisticprocessing/test/limaServer/AnalysisWrapper.h b/lima_linguisticprocessing/test/limaServer/AnalysisWrapper.h index 28949f638..9230e9f54 100644 --- a/lima_linguisticprocessing/test/limaServer/AnalysisWrapper.h +++ b/lima_linguisticprocessing/test/limaServer/AnalysisWrapper.h @@ -22,6 +22,7 @@ #define ANALYSISWRAPPER_H #include +#include #include #include @@ -45,8 +46,9 @@ class AnalysisWrapper : public QObject { Q_OBJECT public: - AnalysisWrapper (Lima::LinguisticProcessing::AbstractLinguisticProcessingClient* m_analyzer, - const std::set& langs, QObject* parent = 0 ); + AnalysisWrapper ( + std::shared_ptr< Lima::LinguisticProcessing::AbstractLinguisticProcessingClient > analyzer, + const std::set& langs, QObject* parent = 0 ); virtual ~AnalysisWrapper(); QString analyze(const QString& text, const QString& language, const QString& pipeline); diff --git a/lima_linguisticprocessing/test/limaServer/LimaDBusServer.cpp 
b/lima_linguisticprocessing/test/limaServer/LimaDBusServer.cpp index d9eb99972..7d6f0fab1 100644 --- a/lima_linguisticprocessing/test/limaServer/LimaDBusServer.cpp +++ b/lima_linguisticprocessing/test/limaServer/LimaDBusServer.cpp @@ -93,7 +93,7 @@ LimaDBusServerPrivate::LimaDBusServerPrivate( const std::string& configDir, pipelines); LDEBUG << "LimaDBusServer::LimaDBusServer: createClient..."; - m_analyzer=new AnalysisWrapper(static_cast(LinguisticProcessingClientFactory::single().createClient(clientId)),m_langs,p); + m_analyzer=new AnalysisWrapper(std::dynamic_pointer_cast(LinguisticProcessingClientFactory::single().createClient(clientId)),m_langs,p); } diff --git a/lima_linguisticprocessing/test/limaServer/LimaServer.cpp b/lima_linguisticprocessing/test/limaServer/LimaServer.cpp index af43b4191..97c680e7b 100644 --- a/lima_linguisticprocessing/test/limaServer/LimaServer.cpp +++ b/lima_linguisticprocessing/test/limaServer/LimaServer.cpp @@ -122,7 +122,7 @@ LimaServer::LimaServer( const std::string& configDir, pipelines); LDEBUG << "LimaServer::LimaServer: createClient..."; - m_analyzer=static_cast(LinguisticProcessingClientFactory::single().createClient(clientId)); + m_analyzer = std::dynamic_pointer_cast(LinguisticProcessingClientFactory::single().createClient(clientId)); LDEBUG << "LimaServer::LimaServer: create QHttpServer..."; m_server = new QHttpServer(this); @@ -137,17 +137,6 @@ LimaServer::LimaServer( const std::string& configDir, LimaServer::~LimaServer() { - CORECLIENTLOGINIT; - LINFO << "LimaServer::~LimaServer"; - // free client - LINFO << "LimaServer::~LimaServer: httpserver deleted!"; - delete m_analyzer; - LINFO << "LimaServer::~LimaServer: m_analyzer deleted"; - // free MediaticData ??? 
- delete Common::MediaticData::MediaticData::pchangeable(); - LINFO << "LimaServer::~LimaServer: mediaticData deleted"; - // free linguistic processing ressources - delete LinguisticProcessingClientFactory::pchangeable(); } void LimaServer::quit() { @@ -166,7 +155,7 @@ void LimaServer::handleRequest(QHttpRequest *req, QHttpResponse *resp) CORECLIENTLOGINIT; req->storeBody(); LDEBUG << "LimaServer::handleRequest: create AnalysisThread..."; - AnalysisThread *thread = new AnalysisThread(m_analyzer, req, resp, m_langs, this ); + AnalysisThread *thread = new AnalysisThread(m_analyzer.get(), req, resp, m_langs, this ); connect(req,SIGNAL(end()),thread,SLOT(startAnalysis())); connect(thread, SIGNAL(finished()), thread, SLOT(deleteLater())); thread->start(); diff --git a/lima_linguisticprocessing/test/limaServer/LimaServer.h b/lima_linguisticprocessing/test/limaServer/LimaServer.h index 27575b189..01484c5df 100644 --- a/lima_linguisticprocessing/test/limaServer/LimaServer.h +++ b/lima_linguisticprocessing/test/limaServer/LimaServer.h @@ -47,6 +47,7 @@ #include #include #include +#include class QTimer; @@ -80,7 +81,7 @@ private Q_SLOTS: QTimer* m_timer; - Lima::LinguisticProcessing::AbstractLinguisticProcessingClient* m_analyzer; + std::shared_ptr< Lima::LinguisticProcessing::AbstractLinguisticProcessingClient > m_analyzer; }; #endif diff --git a/lima_linguisticprocessing/test/limaServer/analysisthread.cpp b/lima_linguisticprocessing/test/limaServer/analysisthread.cpp index f1790c6fb..b695242a7 100644 --- a/lima_linguisticprocessing/test/limaServer/analysisthread.cpp +++ b/lima_linguisticprocessing/test/limaServer/analysisthread.cpp @@ -86,8 +86,6 @@ AnalysisThread::AnalysisThread (Lima::LinguisticProcessing::AbstractLinguisticPr AnalysisThread::~AnalysisThread() { - CORECLIENTLOGINIT; - LDEBUG << "AnalysisThread::~AnalysisThread"; delete m_d; } diff --git a/lima_linguisticprocessing/test/limaServer/main.cpp b/lima_linguisticprocessing/test/limaServer/main.cpp index 
1bbe076a6..4bed11d8b 100644 --- a/lima_linguisticprocessing/test/limaServer/main.cpp +++ b/lima_linguisticprocessing/test/limaServer/main.cpp @@ -25,6 +25,7 @@ #include #include "common/LimaCommon.h" +#include "common/tools/FileUtils.h" #include "common/QsLog/QsLogCategories.h" #include "common/XMLConfigurationFiles/xmlConfigurationFileParser.h" #include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" @@ -43,9 +44,13 @@ namespace po = boost::program_options; int main(int argc, char **argv) { + QStringList configDirs = Misc::buildConfigurationDirectoriesList(QStringList() << "lima",QStringList()); + QString configPath = configDirs.join(LIMA_PATH_SEPARATOR); + QCoreApplication app(argc, argv); - QsLogging::initQsLog(); + QsLogging::initQsLog(configPath); Lima::AmosePluginsManager::single(); + Lima::AmosePluginsManager::changeable().loadPlugins(configPath); std::cerr << "Amose plugins initialized" << std::endl; QsLogging::initQsLog(); diff --git a/lima_linguisticprocessing/test/srl.cpp b/lima_linguisticprocessing/test/srl.cpp new file mode 100644 index 000000000..a00097fd1 --- /dev/null +++ b/lima_linguisticprocessing/test/srl.cpp @@ -0,0 +1,125 @@ +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. 
If not, see +*/ + +#include +#include +#include + +#if PY_MAJOR_VERSION < 3 +#error "Python version must be 3 or more" +#endif + +int main(int argc, char **argv) +{ +std::string text = "1 The the DET DT _ _ 2 NMOD _ _\n" +"2 Convention Convention NP NNP _ _ 4 SUB _ _\n" +"3 also also ADV RB _ _ 4 VMOD _ _\n" +"4 established establish V VBD _ _ _ _ _ _\n" +"5 eleven eleven NOMBRE CD Numex.NUMBER _ _ _ _ _\n" +"6 Working working ADJ JJ _ _ 7 NMOD _ _\n" +"7 Groups group NC NNS _ _ 4 OBJ _ _\n" +"8 and and CONJ CC _ _ _ _ _ _\n" +"9 three three NOMBRE CD Numex.NUMBER _ _ _ _ _\n" +"10 Discussion discussion NC NN _ _ 11 NMOD _ _\n" +"11 Circles circle NC NNS _ _ _ DEP _ _"; + +/* + * Find the first python executable in the path and use it as the program name. + * + * This allows to find the modules set up in an activated virtualenv + */ + QString str_program_name; + QString pathEnv = QString::fromUtf8(qgetenv("PATH").constData()); + for (const auto & path: pathEnv.split(QRegExp("[;:]"))) + { + if (QFile::exists(path + "/python" )) + { + str_program_name = path + "/python"; + break; + } + } +#ifndef WIN32 + Py_SetProgramName(const_cast( str_program_name.toStdWString().c_str())); +#else + Py_SetProgramName( (wchar_t*)str_program_name.unicode() ); +#endif + + Py_Initialize(); + + PyObject* main_module = PyImport_ImportModule("__main__"); + PyObject* main_dict = PyModule_GetDict(main_module); + PyObject* sys_module = PyImport_ImportModule("sys"); + if (sys_module == NULL) + { + std::cerr << "Failed to import the sys module" << std::endl; + PyErr_Print(); + } + PyObject* sys_dict = PyModule_GetDict(sys_module); + PyDict_SetItemString(main_dict, "sys", sys_module); + + // Add the path to the knowledgesrl pachkage to putho path + PyObject* pythonpath = PySys_GetObject("path"); + if (PyList_Append(pythonpath, PyUnicode_DecodeFSDefault("/home/gael/Projets/knowledgesrl/src")) == -1) + { + std::cerr << "Failed to append to python path" << std::endl; + PyErr_Print(); + Py_Exit(1); 
+ } + + // Import the semanticrolelabeler module + PyObject* semanticrolelabeler_module = PyImport_ImportModule("semanticrolelabeler"); + if (semanticrolelabeler_module == NULL) + { + std::cerr << "Failed to import srl semanticrolelabeler module" << std::endl; + PyErr_Print(); + Py_Exit(1); + } + + // Create the semantic role labeller instance + PyObject* instance = PyObject_CallMethod(semanticrolelabeler_module, "SemanticRoleLabeler", "[s]", "--log=debug"); + if (instance == NULL) + { + std::cerr << "Cannot instantiate the SemanticRoleLabeler python class" << std::endl; + PyErr_Print(); + Py_Exit(1); + } + + // Run the semantic role labeller + PyObject* callResult = PyObject_CallMethod(instance, "annotate", "s", text.c_str()); + if (callResult == NULL) + { + std::cerr << "Failed to call the annotate method" << std::endl; + PyErr_Print(); + Py_Exit(1); + } + + // Display the SRL result + char* result = PyUnicode_AsUTF8(callResult); + if (result == NULL) + { + std::cerr << "Cannot convert result item to string" << std::endl; + PyErr_Print(); + Py_Exit(1); + } + std::cout << "Python result is:" << std::endl << result; + Py_Finalize(); + + return 0; + +} diff --git a/lima_linguisticprocessing/tools/applyAutomaton/apply-rules.cpp b/lima_linguisticprocessing/tools/applyAutomaton/apply-rules.cpp index f7e019dd3..10091cde9 100644 --- a/lima_linguisticprocessing/tools/applyAutomaton/apply-rules.cpp +++ b/lima_linguisticprocessing/tools/applyAutomaton/apply-rules.cpp @@ -225,12 +225,12 @@ void readCommandLineArguments(uint64_t argc, char *argv[]) } } if (param.resourcesPath.empty()) { - char* resourcesStr = getenv("LIMA_RESOURCES"); + const char* resourcesStr = qgetenv("LIMA_RESOURCES").constData(); if (resourcesStr != NULL) { param.resourcesPath = resourcesStr; } else { cerr << "$LIMA_RESOURCES not defined" << endl; exit(1); } } if (param.configDir.empty()) { - char* configStr = getenv("LIMA_CONF"); + const char* configStr = qgetenv("LIMA_CONF").constData(); if (configStr 
!= NULL) { param.configDir = configStr; } else { cerr << "$LIMA_CONF not defined" << endl; exit(1); } } @@ -240,8 +240,7 @@ void readCommandLineArguments(uint64_t argc, char *argv[]) // local getline void localGetline(ifstream& file, LimaString& line) { - string str; - getline(file,str); + string str = Lima::Common::Misc::readLine(file); if (param.encoding=="latin1") { line = Misc::latin15stdstring2limastring(str); } diff --git a/lima_linguisticprocessing/tools/automatonCompiler/compile-rules.cpp b/lima_linguisticprocessing/tools/automatonCompiler/compile-rules.cpp index 719562300..41cff255e 100644 --- a/lima_linguisticprocessing/tools/automatonCompiler/compile-rules.cpp +++ b/lima_linguisticprocessing/tools/automatonCompiler/compile-rules.cpp @@ -1,540 +1,615 @@ -/* - Copyright 2002-2013 CEA LIST - - This file is part of LIMA. - - LIMA is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - LIMA is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with LIMA. 
If not, see -*/ -/****************************************************************************** -* -* File : compile-rules.cpp -* Author : Besancon Romaric (besanconr@zoe.cea.fr) -* Created on : Fri Oct 25 2002 -* Copyright : (c) 2002 by CEA -* Version : $Id$ -* -******************************************************************************/ - - -#ifdef HAVE_CONFIG_H -#include -#endif - -#include "compilerExceptions.h" -#include "libautomatonCompiler/recognizerCompiler.h" - -#include "linguisticProcessing/core/LinguisticResources/AbstractResource.h" -#include "linguisticProcessing/core/LinguisticResources/LinguisticResources.h" -#include "linguisticProcessing/client/LinguisticProcessingClientFactory.h" -#include "linguisticProcessing/client/AnalysisHandlers/BowTextWriter.h" -#include "linguisticProcessing/client/AnalysisHandlers/SimpleStreamHandler.h" -// #include "common/AbstractFactoryPattern/MainFactory.h" -#include "common/tools/LimaMainTaskRunner.h" -#include "common/MediaProcessors/MediaProcessUnit.h" -#include "common/MediaProcessors/MediaAnalysisDumper.h" -#include "common/AbstractFactoryPattern/AmosePluginsManager.h" -#include "common/time/timeUtilsController.h" - -#include "linguisticProcessing/core/Automaton/recognizer.h" -#include "linguisticProcessing/core/Automaton/automatonReaderWriter.h" -#include "linguisticProcessing/core/Automaton/automatonCommon.h" // for exceptions -#include "common/LimaCommon.h" -#include "common/MediaticData/mediaticData.h" -#include -#include -#include -#include -#include - -#include - - -using namespace std; - -using namespace Lima; -using namespace Lima::Common::XMLConfigurationFiles; -using namespace Lima::LinguisticProcessing::Automaton; -using namespace Lima::LinguisticProcessing; -using namespace Lima::Common::MediaticData; -using namespace Lima::Common::Misc; - -//**************************************************************************** -// declarations 
-//**************************************************************************** -// help mode & usage -static const string USAGE("usage : compile-rules [-h] -ooutputfile rulesfile\n"); - -static const string HELP("A compiler for the rules of the Named Entities recognizer\n" - +USAGE - +"\n" -+"-h : this help page\n" -+"--output=file : name of the output file for the compiled rules\n" -// +"(or -ofile)\n" -+"\n" -+"--language=... : specify the language of the recognizer\n" -+"--modex=... : specify the name of the modex config file\n" -+"--pipeline=... : specify the name of the pipeline for the modex\n" -+"--configDir=... : specify the directory to find the config files (default is $LIMA_CONF)\n" -+"--resourcesDir=... : specify the directory to find the resources (default is $LIMA_RESOURCES)\n" -+"--common-config-file=... : = Optional. Default is lima-common.xml\n" -+"--lp-config-file=... : = Optional. Default is lima-analysis.xml\n" -+"--encoding=... : specify the encoding of the rules file\n" -+"--useDictionary : uses a dictionary to reorganize rules\n" -+"--debug : compiles in debug mode\n" -+"\n" -+"--listTriggers : list the triggers with the corresponding offest\n" -+"--bin (or -r) : read a binary file containing compiled rules : if \n" -+" the --listTriggers is not set, print the rules on stdout\n" -+"\n" -+"rulesfile is the name of the file containing the rules in plain text\n"); - -//**************************************************************************** -#define DEFAULT_COMMON_CONFIG "lima-common.xml" -#define DEFAULT_LP_CONFIG "lima-analysis.xml" -#define DEFAULT_ENCODING "utf8" - -//**************************************************************************** -// GLOBAL variable -> the command line arguments -struct Param -{ - string inputRulesFile; // name of the rules file - string outputFile; // name of the output file for the compiled rules - string resourcesDir; // directory for resources - string configDir; // directory for config files - string 
commonConfigFile; // config file for linguisticData - string lpConfigFile; // config file for linguistic processing - string modexConfigFile; // config file for modex - string pipeline; // pipeline for modex (defined in config file) - string language; // language of the files - string encoding; // default encoding of rules files - bool decompile; // reads compiled rules - bool listTriggers; // list the triggers with their associated index - bool useDictionary; // use a dictionary to reorganize rules - bool loadPossibleTypes;// force loading of possible types - bool debug; // compile in debug mode (store rule ids for debug purposes) - bool help; // help mode -} -param={"", - "", - "", - "", - DEFAULT_COMMON_CONFIG, - DEFAULT_LP_CONFIG, - "", - "", - "", - DEFAULT_ENCODING, - false, - false, - false, - false, - false, - false}; - -void readCommandLineArguments(uint64_t argc, char *argv[]) -{ -// bool languageSpecified(false); - for(uint64_t i(1); i= argc) - { - std::cerr << "no output filename given" << endl; - cerr << USAGE << endl; - exit(1); - } - else - { - param.outputFile = argv[i]; - } - } - } - else if (s.find("--output=",0)==0) - { - param.outputFile=s.substr(9,s.length()-9); - } - else if (s.find("--modex=",0)==0) - { - param.modexConfigFile=string(s,8); - } - else if (s.find("--pipeline=",0)==0) - { - param.pipeline=string(s,11); - } - else if (s.find("--debug",0)==0) - { - param.debug=true; - } - else if (s[0]=='-') - { - std::cerr << "unrecognized option " << s << endl; - cerr << USAGE << endl; - exit(1); - } - else - { // file names - param.inputRulesFile=s; - } - } - // if not specified, search default values in environment variables - if (param.resourcesDir.empty()) - { - char* resourcesStr = getenv("LIMA_RESOURCES"); - if (resourcesStr != NULL) - { - param.resourcesDir = resourcesStr; - } - else - { - param.resourcesDir = "/usr/share/apps/lima/resources/"; - } - } - if (param.configDir.empty()) - { - char* configStr = getenv("LIMA_CONF"); - if 
(configStr != NULL) - { - param.configDir = configStr; - } - else - { - param.configDir = "/usr/share/config/lima/"; - } - } - - //ensure all needed parameters are set - if (param.language.empty()) { - cerr << "Error: missing --language=.. argument " << endl; - exit(1); - } -// if (param.modexConfigFile.empty()) { -// cerr << "Error: missing --modex=.. argument " << endl; -// exit(1); -// } - -} - -std::vector getDynamicLibraryNames(XMLConfigurationFileParser& parser, const std::string& pipeline); - -//**************************************************************************** -// M A I N -//**************************************************************************** -#include "common/tools/LimaMainTaskRunner.h" -#include "common/AbstractFactoryPattern/AmosePluginsManager.h" -#include - -int run(int aargc,char** aargv); - -int main(int argc, char **argv) -{ - QCoreApplication a(argc, argv); - - // Task parented to the application so that it - // will be deleted by the application. - Lima::LimaMainTaskRunner* task = new Lima::LimaMainTaskRunner(argc, argv, run, &a); - - // This will cause the application to exit when - // the task signals finished. - QObject::connect(task, SIGNAL(finished(int)), &a, SLOT(quit())); - - // This will run the task from the application event loop. 
- QTimer::singleShot(0, task, SLOT(run())); - - return a.exec(); - -} - - -int run(int argc,char** argv) -{ - QsLogging::initQsLog(); - //Lima::TimeUtilsController("run", true); - // Necessary to initialize factories - Lima::AmosePluginsManager::single(); - - readCommandLineArguments(argc,argv); - - deque langs; - langs.push_back(param.language); - - // initialize linguisticData -// try - { - // initialize common - LOGINIT("Automaton::Compiler"); - LDEBUG << "main: MediaticData::changeable().init( " << param.resourcesDir << ")..."; - MediaticData::changeable().init( - param.resourcesDir, - param.configDir, - param.commonConfigFile, - langs); - LDEBUG << "main: MediaticData::changeable().init( " << param.resourcesDir << ") done!"; - - /* - * @TODO eviter l'initialisation des ressources dans compiles rules - * On est oblige d'initialiser les ressources, juste pour recuperer un - * Recognizer vide. Il faut pouvoir creer un Recognizer dans avoir a - * initialiser les ressources - */ - - // initialize linguistic processing resources - MediaId language = MediaticData::single().media(param.language); - - XMLConfigurationFileParser lpconfig(param.configDir + "/" + param.lpConfigFile); - const string& langConfigFile=lpconfig.getModuleGroupParamValue("lima-coreclient","mediaProcessingDefinitionFiles",param.language); - XMLConfigurationFileParser langParser(param.configDir + "/" + langConfigFile); - ModuleConfigurationStructure& module=langParser.getModuleConfiguration("Resources"); - LinguisticResources::changeable().initLanguage( - language, - module, - false); // don't load mainkeys in stringpool, no use - - AbstractResource* resReco = LinguisticResources::single().getResource(language,"automatonCompiler"); - - Recognizer& reco = *(static_cast< Recognizer* >(resReco)); - - // look at the modex config file to find the dynamic libraries that must be loaded - if (! 
param.modexConfigFile.empty()) { - LOGINIT("Automaton::Compiler"); - LDEBUG << "use modex file " << param.modexConfigFile; - XMLConfigurationFileParser modexconfig(param.configDir + "/" + param.modexConfigFile); - vector libraries=getDynamicLibraryNames(modexconfig,param.pipeline); - for (vector::const_iterator it=libraries.begin(),it_end=libraries.end();it!=it_end; it++) - { - LOGINIT("Automaton::Compiler"); - LDEBUG << "load library " << *it; - Common::DynamicLibrariesManager::changeable().loadLibrary(*it); - } - } - - //Recognizer reco; - // if the rules file is in binary format and we want to print its content - if (param.decompile) - { - try - { - //reco.readFromFile(param.inputRulesFile); - AutomatonReader reader; - reader.readRecognizer(param.inputRulesFile,reco); - - if (! param.listTriggers) - { - cout << reco; - } - } - catch (exception& e) - { - std::cerr << "Error while reading rules file: " << e.what() << endl; - exit(1); - } - } - else - { - // read the rules file in text format - //try - { - // Lima::TimeUtilsController *ctrl2 = new Lima::TimeUtilsController("read file and build recognizer", true); - // Lima::TimeUtilsController("read file and build recognizer", true); - std::cerr << "\rBuilding recognizer…"; - RecognizerCompiler::setRecognizerEncoding(param.encoding); - RecognizerCompiler compiler(param.inputRulesFile); - compiler.buildRecognizer(reco,language); - // delete ctrl2; - } - /*catch (exception& e) - { - std::cerr << "recognizer construction failed:"<< e.what() << endl; - exit(1); - }*/ - - // if we want to use a dictionary to reorganize rules - if (param.useDictionary) - { - // Lima::TimeUtilsController("useDictionary", true); - try - { - - string dicostr = "mainDictionary"; - AbstractResource* res= LinguisticResources::single().getResource(language,dicostr); - - AnalysisDict::AbstractAnalysisDictionary* dico = static_cast< AnalysisDict::AbstractAnalysisDictionary* >(res); - if (dico==0) - { - throw runtime_error("dictionary not 
available for language "+ - param.language); - } - // Reorganization not available - // reco.reorganizeRules(*dico); - } - // when character is searched out of text buffer - catch (std::exception& e) { - std::cerr << "Error: " << e.what() << endl; - } - } - - // write recognizer to file - try - { - if (! param.outputFile.empty()) - { - std::cerr << "\rWriting recognizer…"; - AutomatonWriter writer; - writer.writeRecognizer(reco,param.outputFile,language,param.debug); - //reco.writeToFile(param.outputFile); - } - } - catch (Lima::LinguisticProcessing::Automaton::OpenFileException& e) - { - std::cerr << "OpenFileException: " << e.what() << endl; exit(1); - } - } - - if (param.listTriggers) - { - reco.listTriggers(); - } - - } -// catch (InvalidConfiguration& e) -// { -// std::cerr << "Caught InvalidConfiguration: " << e.what() << std::endl; -// throw e; -// } -// catch (NoSuchModule &) -// { -// std::cerr << e.what() << std::endl; -// } -// catch (NoSuchGroup& e) -// { -// std::cerr << e.what() << std::endl; -// } -// catch (NoSuchParam& ) -// { -// std::cerr << e.what() << std::endl; -// } - TIMELOGINIT; - TimeUtils::logAllCumulatedTime("And at last"); - - - return EXIT_SUCCESS; -} - -//----------------------------------------------------------------------------------------------- -//----------------------------------------------------------------------------------------------- -void addLibs(GroupConfigurationStructure& group, - std::vector& libNames) -{ - try { - std::string libs=group.getAttribute("lib"); - std::string::size_type begin=0; - std::string::size_type i=libs.find(",",begin); - while (i!=std::string::npos) { - libNames.push_back(string(libs,begin,i-begin)); - begin=i+1; - i=libs.find(",",begin); - } - libNames.push_back(string(libs,begin)); - } - catch (NoSuchAttribute& ) {} // do nothing: optional -} - -std::vector getDynamicLibraryNames(XMLConfigurationFileParser& parser, - const std::string& pipeline) -{ - vector libNames; - try { - 
ModuleConfigurationStructure& module=parser.getModuleConfiguration("Processors"); - - if (! pipeline.empty()) { - // search libs for given pipeline - try { - GroupConfigurationStructure group=module.getGroupNamed(pipeline); - addLibs(group,libNames); - // do it for all groups included in pipeline - deque& processUnits=group.getListsValueAtKey("processUnitSequence"); - for (deque::const_iterator it=processUnits.begin(),it_end=processUnits.end(); it!=it_end; it++) - { - try { - GroupConfigurationStructure pu=module.getGroupNamed(*it); - addLibs(pu,libNames); - // @todo: should be recursive - } - catch (NoSuchGroup) {} // missing group for processUnit in pipeline : ignored - } - return libNames; - } - catch (NoSuchGroup) { - cerr << "Warning: config file for modex has no group '" << pipeline << "' in 'Processors' : ignored" << endl; - } - catch (NoSuchList) {} // no processUnitSequence list : ignored - } - - // if no pipeline specified, go through all groups - for (ModuleConfigurationStructure::iterator it=module.begin(), - it_end=module.end(); it!=it_end; it++) - { - // ModuleConfigurationStructure is a map - addLibs((*it).second,libNames); - } - } - catch (NoSuchModule &) { - cerr << "Error: config file for modex has no module 'Processors'" << endl; - } - - return libNames; -} +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. 
If not, see +*/ +/****************************************************************************** +* +* File : compile-rules.cpp +* Author : Besancon Romaric (besanconr@zoe.cea.fr) +* Created on : Fri Oct 25 2002 +* Copyright : (c) 2002 by CEA +* Version : $Id$ +* +******************************************************************************/ + + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include "compilerExceptions.h" +#include "libautomatonCompiler/recognizerCompiler.h" + +#include "linguisticProcessing/core/LinguisticResources/AbstractResource.h" +#include "linguisticProcessing/core/LinguisticResources/LinguisticResources.h" +#include "linguisticProcessing/client/LinguisticProcessingClientFactory.h" +#include "linguisticProcessing/client/AnalysisHandlers/BowTextWriter.h" +#include "linguisticProcessing/client/AnalysisHandlers/SimpleStreamHandler.h" +// #include "common/AbstractFactoryPattern/MainFactory.h" +#include "common/tools/LimaMainTaskRunner.h" +#include "common/MediaProcessors/MediaProcessUnit.h" +#include "common/MediaProcessors/MediaAnalysisDumper.h" +#include "common/AbstractFactoryPattern/AmosePluginsManager.h" +#include "common/time/timeUtilsController.h" +#include "common/tools/FileUtils.h" + +#include "linguisticProcessing/core/Automaton/recognizer.h" +#include "linguisticProcessing/core/Automaton/automatonReaderWriter.h" +#include "linguisticProcessing/core/Automaton/automatonCommon.h" // for exceptions +#include "common/LimaCommon.h" +#include "common/MediaticData/mediaticData.h" +#include +#include +#include +#include +#include + +#include + + +using namespace std; + +using namespace Lima; +using namespace Lima::Common::XMLConfigurationFiles; +using namespace Lima::LinguisticProcessing::Automaton; +using namespace Lima::LinguisticProcessing; +using namespace Lima::Common::MediaticData; +using namespace Lima::Common::Misc; + +//**************************************************************************** +// declarations 
+//**************************************************************************** +// help mode & usage +static const string USAGE("usage : compile-rules [-h] -ooutputfile rulesfile\n"); + +static const string HELP("A compiler for the rules of the Named Entities recognizer\n" + +USAGE + +"\n" ++"-h : this help page\n" ++"--output=file : name of the output file for the compiled rules\n" +// +"(or -ofile)\n" ++"\n" ++"--language=... : specify the language of the recognizer\n" ++"--modex=... : specify the name of the modex config file\n" ++"--pipeline=... : specify the name of the pipeline for the modex\n" ++"--configDir=... : specify the directory to find the config files (default is $LIMA_CONF)\n" ++"--resourcesDir=... : specify the directory to find the resources (default is $LIMA_RESOURCES)\n" ++"--common-config-file=... : = Optional. Default is lima-common.xml\n" ++"--lp-config-file=... : = Optional. Default is lima-analysis.xml\n" ++"--encoding=... : specify the encoding of the rules file\n" ++"--useDictionary : uses a dictionary to reorganize rules\n" ++"--debug : compiles in debug mode\n" ++"\n" ++"--listTriggers : list the triggers with the corresponding offest\n" ++"--bin (or -r) : read a binary file containing compiled rules : if \n" ++" the --listTriggers is not set, print the rules on stdout\n" ++"\n" ++"rulesfile is the name of the file containing the rules in plain text\n"); + +//**************************************************************************** +#define DEFAULT_COMMON_CONFIG "lima-common.xml" +#define DEFAULT_LP_CONFIG "lima-analysis.xml" +#define DEFAULT_ENCODING "utf8" + +//**************************************************************************** +// GLOBAL variable -> the command line arguments +struct Param +{ + string inputRulesFile; // name of the rules file + string outputFile; // name of the output file for the compiled rules + string resourcesDir; // directory for resources + string configDir; // directory for config files + string 
commonConfigFile; // config file for linguisticData + string lpConfigFile; // config file for linguistic processing + string modexConfigFile; // config file for modex + string pipeline; // pipeline for modex (defined in config file) + string language; // language of the files + string encoding; // default encoding of rules files + bool decompile; // reads compiled rules + bool listTriggers; // list the triggers with their associated index + bool useDictionary; // use a dictionary to reorganize rules + bool loadPossibleTypes;// force loading of possible types + bool debug; // compile in debug mode (store rule ids for debug purposes) + bool help; // help mode +} +param={"", + "", + "", + "", + DEFAULT_COMMON_CONFIG, + DEFAULT_LP_CONFIG, + "", + "", + "", + DEFAULT_ENCODING, + false, + false, + false, + false, + false, + false}; + +void readCommandLineArguments(uint64_t argc, char *argv[]) +{ +// bool languageSpecified(false); + for(uint64_t i(1); i= argc) + { + std::cerr << "no output filename given" << endl; + cerr << USAGE << endl; + exit(1); + } + else + { + param.outputFile = argv[i]; + } + } + } + else if (s.find("--output=",0)==0) + { + param.outputFile=s.substr(9,s.length()-9); + } + else if (s.find("--modex=",0)==0) + { + param.modexConfigFile=string(s,8); + } + else if (s.find("--pipeline=",0)==0) + { + param.pipeline=string(s,11); + } + else if (s.find("--debug",0)==0) + { + param.debug=true; + } + else if (s[0]=='-') + { + std::cerr << "unrecognized option " << s << endl; + cerr << USAGE << endl; + exit(1); + } + else + { // file names + param.inputRulesFile=s; + } + } + + //ensure all needed parameters are set + if (param.language.empty()) { + cerr << "Error: missing --language=.. 
argument " << endl; + exit(1); + } + +} + +std::vector getDynamicLibraryNames(XMLConfigurationFileParser& parser, const std::string& pipeline); + +//**************************************************************************** +// M A I N +//**************************************************************************** +#include "common/tools/LimaMainTaskRunner.h" +#ifdef ANTINNO_SPECIFIC +#include "common/AbstractFactoryPattern/antinno.LibraryLoader.class.h" +#else +#include "common/AbstractFactoryPattern/AmosePluginsManager.h" +#endif +#include + +int run(int aargc,char** aargv); + +int main(int argc, char **argv) +{ + QCoreApplication a(argc, argv); + + // Task parented to the application so that it + // will be deleted by the application. + Lima::LimaMainTaskRunner* task = new Lima::LimaMainTaskRunner(argc, argv, run, &a); + + // This will cause the application to exit when + // the task signals finished. + QObject::connect(task, SIGNAL(finished(int)), &a, SLOT(quit())); + + // This will run the task from the application event loop. 
+ QTimer::singleShot(0, task, SLOT(run())); + + return a.exec(); + +} + + +int run(int argc,char** argv) +{ + readCommandLineArguments(argc,argv); + + QStringList configDirs = buildConfigurationDirectoriesList(QStringList() << "lima",QStringList()); + QString configPath = configDirs.join(LIMA_PATH_SEPARATOR); + if (!param.configDir.empty()) + { + configPath = QString::fromUtf8(param.configDir.c_str()); + configDirs = configPath.split(LIMA_PATH_SEPARATOR); + } + + QStringList resourcesDirs = buildResourcesDirectoriesList(QStringList() << "lima",QStringList()); + QString resourcesPath = resourcesDirs.join(LIMA_PATH_SEPARATOR); + + if (!param.resourcesDir.empty()) + { + resourcesPath = QString::fromUtf8(param.resourcesDir.c_str()); + resourcesDirs = resourcesPath.split(LIMA_PATH_SEPARATOR); + } +#ifdef ANTINNO_SPECIFIC + + + + + { + std::string configDir; + + if (param.configDir.empty()) + { + if ((::std::getenv("AMOSE_CONF")) == NULL) + { + std::cerr << "No environment variable \"AMOSE_CONF\" set or variable is empty" << std::endl; + return EXIT_FAILURE; + } + else + { + configDir = ::std::getenv("AMOSE_CONF"); + } + } + else + { + configDir = param.configDir; + } + + try + { + ::std::string const file = configDir + "/plugins.txt"; + Lima::antinno::LibraryLoader().loadFromFile(file); + } + catch (::std::exception const& ex) + { + std::cerr << "Exception during plugins loading. 
" << ex.what() << std::endl; + return EXIT_FAILURE; + } + + ::std::string const log4cppFilePath = configDir + "/log4cpp.properties"; + ::boost::shared_ptr pLog1(new QsLogging::antinno::Log4cpp()); + pLog1->configure(log4cppFilePath); + //QsLogging::antinno::log = pLog1; + QsLogging::antinno::log = pLog1; + if (!QsLogging::Categories::instance().configure(log4cppFilePath.data())) + { + std::cerr << "Configure Problem " << log4cppFilePath << std::endl; + return EXIT_FAILURE; + } + + ::std::cout << "Plugins initialized" << ::std::endl; + } +#else + QsLogging::initQsLog(configPath); + // Necessary to initialize factories + Lima::AmosePluginsManager::single(); + Lima::AmosePluginsManager::changeable().loadPlugins(configPath); +#endif + + + + deque langs; + langs.push_back(param.language); + + // initialize linguisticData +// try + { + // initialize common + LOGINIT("Automaton::Compiler"); + LDEBUG << "main: MediaticData::changeable().init( " << param.resourcesDir << ")..."; + MediaticData::changeable().init( + resourcesPath.toUtf8().constData(), + configPath.toUtf8().constData(), + param.commonConfigFile, + langs); + LDEBUG << "main: MediaticData::changeable().init( " << param.resourcesDir << ") done!"; + + /* + * @TODO eviter l'initialisation des ressources dans compiles rules + * On est oblige d'initialiser les ressources, juste pour recuperer un + * Recognizer vide. 
Il faut pouvoir creer un Recognizer dans avoir a + * initialiser les ressources + */ + + // initialize linguistic processing resources + MediaId language = MediaticData::single().media(param.language); + + bool languageInitialized = false; + Q_FOREACH(QString configDir, configDirs) + { + if (QFileInfo(configDir + "/" + param.lpConfigFile.c_str()).exists()) + { + XMLConfigurationFileParser lpconfig((configDir + "/" + param.lpConfigFile.c_str()).toUtf8().constData()); + const string& langConfigFile=lpconfig.getModuleGroupParamValue("lima-coreclient","mediaProcessingDefinitionFiles",param.language); + XMLConfigurationFileParser langParser((configDir + "/" + langConfigFile.c_str()).toUtf8().constData()); + ModuleConfigurationStructure& module=langParser.getModuleConfiguration("Resources"); + LinguisticResources::changeable().initLanguage( + language, + module, + false); // don't load mainkeys in stringpool, no use + languageInitialized = true; + } + } + if(!languageInitialized) + { + LOGINIT("Automaton::Compiler"); + LERROR << "No language was configured configured with" << configDirs + << "and" << param.lpConfigFile.c_str(); + return EXIT_FAILURE; + } + + AbstractResource* resReco = LinguisticResources::single().getResource(language,"automatonCompiler"); + + Recognizer& reco = *(static_cast< Recognizer* >(resReco)); + + // look at the modex config file to find the dynamic libraries that must be loaded + if (! 
param.modexConfigFile.empty()) { + LOGINIT("Automaton::Compiler"); + LDEBUG << "use modex file " << param.modexConfigFile; + bool modexInitialized = false; + Q_FOREACH(QString configDir, configDirs) + { + if (QFileInfo(configDir + "/" + param.modexConfigFile.c_str()).exists()) + { + XMLConfigurationFileParser modexconfig((configDir + "/" + param.modexConfigFile.c_str()).toUtf8().constData()); + vector libraries=getDynamicLibraryNames(modexconfig,param.pipeline); + for (vector::const_iterator it=libraries.begin(),it_end=libraries.end();it!=it_end; it++) + { + LOGINIT("Automaton::Compiler"); + LDEBUG << "load library " << *it; + Common::DynamicLibrariesManager::changeable().loadLibrary(*it); + } + modexInitialized = true; + } + } + if(!modexInitialized) + { + LOGINIT("Automaton::Compiler"); + LERROR << "No modex plugin was loaded with" << configDirs + << "and" << param.modexConfigFile.c_str(); + return EXIT_FAILURE; + } + } + //Recognizer reco; + // if the rules file is in binary format and we want to print its content + if (param.decompile) + { + try + { + //reco.readFromFile(param.inputRulesFile); + AutomatonReader reader; + reader.readRecognizer(param.inputRulesFile,reco); + + if (! 
param.listTriggers) + { + cout << reco; + } + } + catch (exception& e) + { + std::cerr << "Error while reading rules file: " << e.what() << endl; + exit(1); + } + } + else + { + // read the rules file in text format + //try + { + // Lima::TimeUtilsController *ctrl2 = new Lima::TimeUtilsController("read file and build recognizer", true); + // Lima::TimeUtilsController("read file and build recognizer", true); + std::cerr << "\rBuilding recognizer…"; + RecognizerCompiler::setRecognizerEncoding(param.encoding); + RecognizerCompiler compiler(param.inputRulesFile); + compiler.buildRecognizer(reco,language); + // delete ctrl2; + } + /*catch (exception& e) + { + std::cerr << "recognizer construction failed:"<< e.what() << endl; + exit(1); + }*/ + + // if we want to use a dictionary to reorganize rules + if (param.useDictionary) + { + // Lima::TimeUtilsController("useDictionary", true); + try + { + + string dicostr = "mainDictionary"; + AbstractResource* res= LinguisticResources::single().getResource(language,dicostr); + + AnalysisDict::AbstractAnalysisDictionary* dico = static_cast< AnalysisDict::AbstractAnalysisDictionary* >(res); + if (dico==0) + { + throw runtime_error("dictionary not available for language "+ + param.language); + } + // Reorganization not available + // reco.reorganizeRules(*dico); + } + // when character is searched out of text buffer + catch (std::exception& e) { + std::cerr << "Error: " << e.what() << endl; + } + } + + // write recognizer to file + try + { + if (! 
param.outputFile.empty()) + { + std::cerr << "\rWriting recognizer…"; + AutomatonWriter writer; + LINFO << "writer.WritingRecognizer(language:" << language << "debug:" << param.debug << ")"; + writer.writeRecognizer(reco,param.outputFile,language,param.debug); + //reco.writeToFile(param.outputFile); + } + } + catch (Lima::LinguisticProcessing::Automaton::OpenFileException& e) + { + std::cerr << "OpenFileException: " << e.what() << endl; exit(1); + } + } + + if (param.listTriggers) + { + reco.listTriggers(); + } + + } +// catch (InvalidConfiguration& e) +// { +// std::cerr << "Caught InvalidConfiguration: " << e.what() << std::endl; +// throw e; +// } +// catch (NoSuchModule &) +// { +// std::cerr << e.what() << std::endl; +// } +// catch (NoSuchGroup& e) +// { +// std::cerr << e.what() << std::endl; +// } +// catch (NoSuchParam& ) +// { +// std::cerr << e.what() << std::endl; +// } + TIMELOGINIT; + TimeUtils::logAllCumulatedTime("And at last"); + + + return EXIT_SUCCESS; +} + +//----------------------------------------------------------------------------------------------- +//----------------------------------------------------------------------------------------------- +void addLibs(GroupConfigurationStructure& group, + std::vector& libNames) +{ + try { + std::string libs=group.getAttribute("lib"); + std::string::size_type begin=0; + std::string::size_type i=libs.find(",",begin); + while (i!=std::string::npos) { + libNames.push_back(string(libs,begin,i-begin)); + begin=i+1; + i=libs.find(",",begin); + } + libNames.push_back(string(libs,begin)); + } + catch (NoSuchAttribute& ) {} // do nothing: optional +} + +std::vector getDynamicLibraryNames(XMLConfigurationFileParser& parser, + const std::string& pipeline) +{ + vector libNames; + try { + ModuleConfigurationStructure& module=parser.getModuleConfiguration("Processors"); + + if (! 
pipeline.empty()) { + // search libs for given pipeline + try { + GroupConfigurationStructure group=module.getGroupNamed(pipeline); + addLibs(group,libNames); + // do it for all groups included in pipeline + deque& processUnits=group.getListsValueAtKey("processUnitSequence"); + for (deque::const_iterator it=processUnits.begin(),it_end=processUnits.end(); it!=it_end; it++) + { + try { + GroupConfigurationStructure pu=module.getGroupNamed(*it); + addLibs(pu,libNames); + // @todo: should be recursive + } + catch (NoSuchGroup) {} // missing group for processUnit in pipeline : ignored + } + return libNames; + } + catch (NoSuchGroup) { + cerr << "Warning: config file for modex has no group '" << pipeline << "' in 'Processors' : ignored" << endl; + } + catch (NoSuchList) {} // no processUnitSequence list : ignored + } + + // if no pipeline specified, go through all groups + for (ModuleConfigurationStructure::iterator it=module.begin(), + it_end=module.end(); it!=it_end; it++) + { + // ModuleConfigurationStructure is a map + addLibs((*it).second,libNames); + } + } + catch (NoSuchModule &) { + cerr << "Error: config file for modex has no module 'Processors'" << endl; + } + + return libNames; +} diff --git a/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/automatonCompiler.cpp b/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/automatonCompiler.cpp index e0bd6f31c..26970e184 100644 --- a/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/automatonCompiler.cpp +++ b/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/automatonCompiler.cpp @@ -27,6 +27,7 @@ ************************************************************************/ #include "automatonCompiler.h" +#include "gazeteer.h" #include "transitionCompiler.h" #include "compilerExceptions.h" #include "linguisticProcessing/core/Automaton/automatonCommon.h" @@ -49,6 +50,7 @@ namespace AutomatonCompiler { 
/***********************************************************************/ Automaton buildAutomaton(const AutomatonString& automatonString, MediaId language, + const std::vector& gazeteers, SearchGraphSense sense, const std::vector& activeEntityGroups) { AUCLOGINIT; @@ -65,7 +67,7 @@ Automaton buildAutomaton(const AutomatonString& automatonString, else { // LDEBUG << "automatonString is: " << automatonString; Tstate finalState=buildAutomaton(a,automatonString, - initialState,currentId,language, + initialState,currentId,language,gazeteers, activeEntityGroups); // Lima::TimeUtilsController* ctrlAF = new Lima::TimeUtilsController("make final", true); // LDEBUG << "final state is " << finalState; @@ -111,6 +113,7 @@ Tstate buildAutomaton(Automaton& a, const AutomatonString& automatonString, const Tstate& initialState, const std::string& currentId, MediaId language, + const std::vector& gazeteers, const std::vector& activeEntityGroups) { #ifdef DEBUG_LP @@ -133,7 +136,7 @@ Tstate buildAutomaton(Automaton& a, // TODO: check if we have to handle modifiers of numbering like first, next and last in currentId while (min > 0) { // must be there x times -> insert it as non-optional - finalState = buildAutomatonNotOptional(a,automatonString,finalState,currentId,language,activeEntityGroups); + finalState = buildAutomatonNotOptional(a,automatonString,finalState,currentId,language,gazeteers,activeEntityGroups); min--; if (max != AutomatonString::INFINITE_OCC) { max--; @@ -148,12 +151,12 @@ Tstate buildAutomaton(Automaton& a, // add the epsilon-transition from first to last (for minOcurrences=0) // and insert again the automaton from last to first // (to avoid epsilon-cycles) - finalState = buildAutomatonNotOptional(a,automatonString,finalState,currentId,language,activeEntityGroups); + finalState = buildAutomatonNotOptional(a,automatonString,finalState,currentId,language,gazeteers,activeEntityGroups); a.addTransition(optInitialState,finalState,new EpsilonTransition()); Tstate 
tmpFinalState(finalState); Tstate tmpReturnState = buildAutomatonNotOptional(a,automatonString, - tmpFinalState,currentId,language,activeEntityGroups); + tmpFinalState,currentId,language,gazeteers,activeEntityGroups); //a.addTransition(tmpReturnState,optInitialState,new EpsilonTransition()); a.addTransition(tmpReturnState,finalState,new EpsilonTransition()); @@ -168,7 +171,7 @@ Tstate buildAutomaton(Automaton& a, // insert it as non-optional as many times as necessary // and add the epsilon-transition while (max > 0) { - finalState = buildAutomatonNotOptional(a,automatonString,finalState,currentId,language,activeEntityGroups); + finalState = buildAutomatonNotOptional(a,automatonString,finalState,currentId,language,gazeteers,activeEntityGroups); a.addTransition(optInitialState,finalState,new EpsilonTransition()); max--; } @@ -176,7 +179,7 @@ Tstate buildAutomaton(Automaton& a, return finalState; } else { - return buildAutomatonNotOptional(a,automatonString,initialState,currentId,language,activeEntityGroups); + return buildAutomatonNotOptional(a,automatonString,initialState,currentId,language,gazeteers,activeEntityGroups); } } @@ -184,6 +187,7 @@ Tstate buildAutomatonNotOptional(Automaton& a, const AutomatonString& automatonString, const Tstate& initialState, const std::string& initialId, MediaId language, + const std::vector& gazeteers, const std::vector& activeEntityGroups) { #ifdef DEBUG_LP @@ -204,7 +208,7 @@ Tstate buildAutomatonNotOptional(Automaton& a, it=automatonString.getParts().begin(), it_end=automatonString.getParts().end(); for (; it!=it_end; it++) { - Tstate altFinalState=buildAutomaton(a,*it,initialState,currentId,language,activeEntityGroups); + Tstate altFinalState=buildAutomaton(a,*it,initialState,currentId,language,gazeteers,activeEntityGroups); a.addTransition(altFinalState,finalState,new EpsilonTransition()); // id??? 
} return finalState; @@ -224,7 +228,7 @@ Tstate buildAutomatonNotOptional(Automaton& a, for (; it!=it_end; it++, subCount++) { std::string currentId(initialId); currentId.append(".").append(std::to_string(static_cast(subCount))); - seqfinalState=buildAutomaton(a,*it,seqInitialState,currentId,language,activeEntityGroups); + seqfinalState=buildAutomaton(a,*it,seqInitialState,currentId,language,gazeteers,activeEntityGroups); seqInitialState=seqfinalState; } return seqfinalState; @@ -234,7 +238,11 @@ Tstate buildAutomatonNotOptional(Automaton& a, #ifdef DEBUG_LP LDEBUG << "is unit "; #endif - TransitionUnit *t = createTransition(automatonString,language,initialId,activeEntityGroups); + TransitionUnit* t; +#ifdef DEBUG_LP + LDEBUG << "buildAutomatonNotOptional: createSimpleTransition from " << automatonString.getString(); +#endif + t = createTransition(automatonString,language,initialId,activeEntityGroups); if (t != 0) { Tstate finalState = a.addState(); a.addTransition(initialState, finalState, t); @@ -244,6 +252,48 @@ Tstate buildAutomatonNotOptional(Automaton& a, throw AutomatonErrorException("attempt to insert empty transition\n"); } } + // We do not yet know how to use gazetteer with any element defined with a category or with space chrecter + else if (automatonString.isSimpleGazeteer()) { +#ifdef DEBUG_LP + LDEBUG << "is simpleGazeteer "; +#endif + const LimaString& unitString = automatonString.getUnitString(); + const LimaString& gazeteerName = unitString.mid(1,unitString.size()-1); +// OME LimaString gazeteerName = automatonString.getUnitString().mid(1,automatonString.getString().size()-1); +// OME int i; +// OME for (i=0; i gazeteerAsVectorOfString = gazeteer; +// OME #ifdef DEBUG_LP +// OME LDEBUG << "buildAutomatonNotOptional: new GazeteerTransition from " << gazeteer.alias(); +// OME #endif + // t = createGazeteerTransition(automatonString,language,initialId,activeEntityGroups,gazeteerAsVectorOfString,true); + // DONE?: replace new GazeteerTransition by 
createTransition.... + // t = new GazeteerTransition(gazeteerAsVectorOfString,gazeteer.alias(),true); + // TransitionUnit* trigger = new GazeteerTransition(gazeteerAsVectorOfString,gazeteerName,keepTrigger); */ + // TODO, vérifier que + // - gérer les "constraints" + TransitionUnit* t = createGazeteerTransition(gazeteerName, + language, initialId, activeEntityGroups, + gazeteers,automatonString.isKept(),false); + + if (t != 0) { + const std::vector& constraints = automatonString.getConstraints(); + for (std::size_t i(0); iaddConstraint(constraints[i]); + } + Tstate finalState = a.addState(); + a.addTransition(initialState, finalState, t); + return finalState; + } + else { + throw AutomatonErrorException("attempt to insert empty transition\n"); + } + } return initialState; } diff --git a/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/automatonCompiler.h b/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/automatonCompiler.h index c934cd27b..dec576eed 100644 --- a/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/automatonCompiler.h +++ b/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/automatonCompiler.h @@ -46,6 +46,7 @@ namespace AutomatonCompiler { // use directly automaton string LIMA_AUTOMATONCOMPILER_EXPORT Automaton buildAutomaton(const AutomatonString& automatonString, MediaId language, + const std::vector& gazeteers, SearchGraphSense sense, const std::vector& activeEntityGroups); @@ -53,12 +54,14 @@ namespace AutomatonCompiler { const AutomatonString& automatonString, const Tstate& initialState, const std::string& currentId, MediaId language, + const std::vector& gazeteers, const std::vector& activeEntityGroups); LIMA_AUTOMATONCOMPILER_EXPORT Tstate buildAutomatonNotOptional(Automaton& a, const AutomatonString& automatonString, const Tstate& initialState, const std::string& currentId, MediaId language, + const std::vector& gazeteers, const std::vector& 
activeEntityGroups); } // end namespace } // end namespace diff --git a/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/automatonString.cpp b/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/automatonString.cpp index 96ede9264..d4b4e5319 100644 --- a/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/automatonString.cpp +++ b/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/automatonString.cpp @@ -717,6 +717,10 @@ void AutomatonString::removeArtificialSequences(const bool inSubPart) { // LDEBUG << "trying to remove : " << (*part).getString(); part=m_parts.erase(part); // LDEBUG << "has erased artificial : " << getString(); +#ifdef ANTINNO_BUGFIX + // spécique lib std microsoft + if (part != m_parts.begin()) +#endif part--; // erase returns iterator following the one erased } // else if (inSubPart) { @@ -968,10 +972,26 @@ void AutomatonString::parseUnit(const LimaString& str, oss << "unknown class " << Common::Misc::limastring2utf8stdstring(str.mid(newBegin+1,newSize-1)); throw AutomatonCompilerException(oss.str()); } - // copy only type, parts and unit (other are set by modifiers) - setType((*it).getAutomatonString().getType()); - m_parts=(*it).getAutomatonString().getParts(); - m_unit=(*it).getAutomatonString().getUnitString(); + const Gazeteer& gazeteer = *it; + //if( !gazeteer.hasMultiTermWord() && gazeteer.hasNoCategoryNorTstatus() ) { + if( gazeteer.hasNoCategoryNorTstatus() ) { +#ifdef DEBUG_LP + LDEBUG << "AutomatonString: set type(SIMPLE_GAZETEER)"; +#endif + setType(SIMPLE_GAZETEER); + // m_parts is empty!; + // m_unit=gazeteer.getName(); + m_unit=str.mid(newBegin,newSize); + } + else { + // copy only type, parts and unit (other are set by modifiers) +#ifdef DEBUG_LP + LDEBUG << "AutomatonString: set type(" << (*it).getAutomatonString().getType() << ")"; +#endif + setType((*it).getAutomatonString().getType()); + m_parts=(*it).getAutomatonString().getParts(); + 
m_unit=(*it).getAutomatonString().getUnitString(); + } } else if (str[newBegin] == CHAR_BEGIN_NAMESUB) { #ifdef DEBUG_LP @@ -1062,6 +1082,9 @@ LimaString AutomatonString::getString() const { case UNIT: { return applyModifiers(m_unit); } + case SIMPLE_GAZETEER: { + return applyModifiers(m_unit); + } case SEQUENCE: { LimaString str; if (m_parts.size()) { diff --git a/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/automatonString.h b/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/automatonString.h index da50f2d46..87e45d67e 100644 --- a/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/automatonString.h +++ b/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/automatonString.h @@ -54,6 +54,7 @@ typedef enum { UNKNOWN_TYPE, UNIT, SEQUENCE, + SIMPLE_GAZETEER, ALTERNATIVE } ElementType; @@ -125,6 +126,7 @@ class AutomatonString bool isUnit() const; bool isSequence() const; bool isAlternative() const; + bool isSimpleGazeteer() const; bool isArtificialSequence() const; //only for construction bool isSplittedFirst() const { return m_isSplittedFirst; } @@ -346,6 +348,9 @@ inline bool AutomatonString::isArtificialSequence() const { inline bool AutomatonString::isUnit() const { return (m_type == UNIT); } +inline bool AutomatonString::isSimpleGazeteer() const { + return (m_type == SIMPLE_GAZETEER); +} inline bool AutomatonString::isSequence() const { return (m_type == SEQUENCE); } diff --git a/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/gazeteer.cpp b/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/gazeteer.cpp index 53ba484d0..50c4251de 100644 --- a/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/gazeteer.cpp +++ b/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/gazeteer.cpp @@ -51,13 +51,18 @@ namespace Automaton { Gazeteer::Gazeteer(): std::vector(0), m_alias(), -m_automatonString() { 
+m_automatonString(), +m_hasMultiTermWord(false), +m_hasNoCategoryNorTstatus(true) +{ } Gazeteer::Gazeteer(const Gazeteer& g): std::vector(g), m_alias(g.m_alias), -m_automatonString(g.m_automatonString) +m_automatonString(g.m_automatonString), +m_hasMultiTermWord(g.m_hasMultiTermWord), +m_hasNoCategoryNorTstatus(g.m_hasNoCategoryNorTstatus) { } @@ -75,6 +80,8 @@ Gazeteer& Gazeteer::operator = (const Gazeteer& g) { std::vector::operator=(g); m_alias = g.alias(); m_automatonString=g.m_automatonString; + m_hasMultiTermWord=g.m_hasMultiTermWord; + m_hasNoCategoryNorTstatus=g.m_hasNoCategoryNorTstatus; } return (*this); } @@ -88,6 +95,23 @@ Gazeteer& Gazeteer::add(const Gazeteer& g) { } /***********************************************************************/ +/***********************************************************************/ +// add a word in the inherited std::vector +// check if word is simple word (no category, no Tstatus) +/***********************************************************************/ +void Gazeteer::addWord(const LimaString& s) { + if( (s.startsWith(STRING_TSTATUS_TR)) + || (s.startsWith(STRING_TSTATUS_TR_small)) + || (s.contains(CHAR_POS_TR)) ) + { + resetCategoryOrTstatusFlag(); + } + if( s.contains(CHAR_SEP_RE) ) { + setHasMultiTermWordFlag(); + } + push_back(s); +} + /***********************************************************************/ // build the automatonString corresponding to the gazeteer diff --git a/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/gazeteer.h b/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/gazeteer.h index 8702e2229..54c478454 100644 --- a/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/gazeteer.h +++ b/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/gazeteer.h @@ -72,8 +72,15 @@ class Gazeteer : public std::vector LimaString readName(RecognizerCompiler& reco); void readValues(RecognizerCompiler& reco, const LimaString& 
stringBegin=LimaString()); + bool hasMultiTermWord() const { return m_hasMultiTermWord; } + bool hasNoCategoryNorTstatus() const { return m_hasNoCategoryNorTstatus; } + void resetCategoryOrTstatusFlag() { m_hasNoCategoryNorTstatus = false; } + void setHasMultiTermWordFlag() { m_hasMultiTermWord = true; } + private: LimaString m_alias; + bool m_hasMultiTermWord; + bool m_hasNoCategoryNorTstatus; AutomatonString m_automatonString; }; @@ -81,7 +88,6 @@ class Gazeteer : public std::vector // inline access functions /***********************************************************************/ inline uint64_t Gazeteer::numberOfWords() const { return size(); } -inline void Gazeteer::addWord(const LimaString& s) { push_back(s); } inline const LimaString& Gazeteer::alias() const { return m_alias; } inline void Gazeteer::setAlias(const LimaString& a) { m_alias = a; } diff --git a/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/recognizerCompiler.cpp b/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/recognizerCompiler.cpp index e98a7cb53..2e237bb31 100644 --- a/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/recognizerCompiler.cpp +++ b/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/recognizerCompiler.cpp @@ -33,6 +33,7 @@ #include "tstring.h" #include "common/LimaCommon.h" #include "common/Data/strwstrtools.h" +#include "common/tools/FileUtils.h" #include "common/XMLConfigurationFiles/xmlConfigurationFileParser.h" #include "common/time/timeUtilsController.h" @@ -66,7 +67,7 @@ m_stream(0), m_nbRule(0) { AUCLOGINIT; - LINFO << "Opening recognizer compiler with file " << filename; + LDEBUG << "Opening recognizer compiler with file " << filename; m_stream=new ifstream(filename.c_str(), std::ifstream::binary); if (! 
m_stream || !m_stream->good()) { LERROR << "Cannot open file [" << filename << "]"; @@ -176,9 +177,9 @@ void RecognizerCompiler::buildRecognizer(Recognizer& reco, next=findSpecialCharacter(s,CHAR_SEP_LIST,begin); LimaString str = s.mid(begin,(next==-1)?next:next-begin); // initialize entities - string filename=Common::MediaticData::MediaticData::single().getConfigPath()+"/"+ - Misc::limastring2utf8stdstring(str); - XMLConfigurationFiles::XMLConfigurationFileParser parser(filename); + + QString filename = Common::Misc::findFileInPaths(Common::MediaticData::MediaticData::single().getConfigPath().c_str(),str); + XMLConfigurationFiles::XMLConfigurationFileParser parser(filename.toUtf8().constData()); MediaticData::MediaticData::changeable().initEntityTypes(parser); begin=next+1; } while (next != -1); @@ -377,9 +378,13 @@ void RecognizerCompiler::buildRecognizer(Recognizer& reco, LERROR << message.str(); } */ - LINFO << "Adding rule no " << m_nbRule << "(" << r->getRuleId() << ")" + LDEBUG << "Adding rule no " << m_nbRule << "(" << r->getRuleId() << ")" << ": trigger=" << *trigger; reco.addRule(trigger,r); +#ifdef DEBUG_LP + LDEBUG << "rule[" << m_nbRule << "]=" << *r; +#endif + m_nbRule++; delete trigger; } @@ -465,7 +470,11 @@ readSubAutomaton(const LimaString& line, } //********************************************************************** -// add a rule with a gazeteer trigger -> multiply the rules +// add a rule with a gazeteer trigger -> +// 1) create a rule and multiply the reference to this rule in the index +// of recognizer (transition with 1 entry,rule) +// 2) create a a gazeteerTransition and create only one entry in the index +// of recognizer (gazeteerTransition,rule) //********************************************************************** void RecognizerCompiler:: addRuleWithGazeteerTrigger(const LimaString& gazeteerName, @@ -479,64 +488,82 @@ addRuleWithGazeteerTrigger(const LimaString& gazeteerName, const bool headTrigger) { AUCLOGINIT; - // 
Lima::TimeUtilsController* ctrl4 = new Lima::TimeUtilsController("addRuleWithGazeteerTrigger", true); - // identify class alias -// int endTrigger(findSpecialCharacter(s,CHAR_SEP_RULE,1)); -// Tword classAlias(s.mid(1,endTrigger-1)); -// s=s.mid(endTrigger+1); // find gazeteer - // Lima::TimeUtilsController* ctrl41 = new Lima::TimeUtilsController("before init Rule inside addRuleWithGazeteerTrigger", true); - std::size_t i; - for (i=0; i0 ) { - // the class has been found - // only one rule and all triggers point to this rule - Rule* r=new Rule; - - //expandGazeteersInRule(ruleString,gazeteers); - //expandSubAutomatonsInRule(ruleString,subAutomatons); - - // check if there are agreement constraints on following lines - // and add them at end of the rule if there are - ruleString=ruleString+peekConstraints(*m_stream); - ruleString=ruleString+defaultAction; - - // add the trigger to deal with agreement constraints - LimaString triggerString=gazeteers[i][0]; - if (! keepTrigger) { - triggerString=CHAR_NOKEEP_OPEN_RE+triggerString+CHAR_NOKEEP_CLOSE_RE; + // gazeteer not found + if ( gazeteerIndex >= gazeteers.size() || gazeteers[gazeteerIndex].size() == 0 ) { + string str=Misc::limastring2utf8stdstring(gazeteerName); + if (gazeteerIndexsetWeight(currentRuleWeight()); - LINFO << "Adding rule no " << m_nbRule << "(" << r->getRuleId() << ")" - << ": multiple trigger (first is "<setWeight(currentRuleWeight()); + LDEBUG << "Adding rule no " << m_nbRule << "(" << r->getRuleId() << ")" + << ": multiple trigger (first is "<& gazeteerAsVectorOfString = gazeteer; + // TransitionUnit* trigger = new GazeteerTransition(gazeteerAsVectorOfString,gazeteerName,keepTrigger); */ + TransitionUnit* trigger = createGazeteerTransition(gazeteerName, + language, currentId, m_activeEntityGroups, + gazeteers,keepTrigger,headTrigger); + if (trigger != 0) + { + //copy the properties of the trigger of the rule + trigger->copyProperties(*(r->getTrigger())); + reco.addRule(trigger,indexRule); + 
//LINFO << nbRule << ": trigger=" << *trigger; + delete trigger; // it has been copied + } + } + else + { + for (std::size_t j(0); j& activeEntityGroups, + const vector& gazeteers, + const bool keep, + const bool head) +{ + int gazeteerIndex; + for (gazeteerIndex=0; gazeteerIndex= gazeteers.size() || gazeteers[gazeteerIndex].size() == 0 ) { + AUCLOGINIT; + string str=Misc::limastring2utf8stdstring(gazeteerName); + if (gazeteerIndex& gazeteerAsVectorOfString = gazeteer; + // TODO bool negative = automatonString.isNegative()??, Est-ce qu'on autorise un trigger avec une négation? + bool negative(false); + TransitionUnit* t = new GazeteerTransition(gazeteerAsVectorOfString, gazeteerName, keep); + t->setNegative(negative); + t->setHead(head); + t->setId(id); + return t; +} + TransitionUnit* createTransition(const AutomatonString& automatonString, MediaId language, const std::string& id, @@ -83,7 +124,9 @@ TransitionUnit* createTransition(const LimaString str, const std::vector& activeEntityGroups, const bool keep, const bool neg, - const std::vector& constraints) + const std::vector& constraints, + const std::vector& gazeteerAsVectorOfString + ) { #ifdef DEBUG_LP AUCLOGINIT; @@ -99,7 +142,7 @@ TransitionUnit* createTransition(const LimaString str, FsaStringsPool& sp=Common::MediaticData::MediaticData::changeable().stringsPool(language); #ifdef DEBUG_LP - LDEBUG << "creating transition from string [" + LDEBUG << "createTransition: creating transition from string [" << Common::Misc::limastring2utf8stdstring(str) << "] with id" << id; #endif @@ -143,7 +186,7 @@ TransitionUnit* createTransition(const LimaString str, if (s[0] == CHAR_NOKEEP_OPEN_TR) { if (s[s.length()-1] != CHAR_NOKEEP_CLOSE_TR) { AUCLOGINIT; - LERROR << "confused by no_keep format (maybe incomplete) :" + LERROR << "createTransition: confused by no_keep format (maybe incomplete) :" << Common::Misc::limastring2utf8stdstring(str); } else { @@ -239,6 +282,14 @@ TransitionUnit* createTransition(const LimaString 
str, t = createDefaultTStatusTransition(s,LENGTH_TSTATUS_TR); } // ---------------------------------------------------------------------- + // GazeteerTransition: form belongs to gazeteer + /* + else if (s.indexOf(CHAR_BEGIN_NAMEGAZ,0) == 0) { + // name of gazeteer already identified! + t = new GazeteerTransition(gazeteerAsVectorOfString,alias,keep); + } + */ + // ---------------------------------------------------------------------- // * transition else if (s == STRING_ANY_TR) { t = new StarTransition(); @@ -246,14 +297,24 @@ TransitionUnit* createTransition(const LimaString str, // ---------------------------------------------------------------------- // entity transition else if (s.size()>=2 && s[0]==CHAR_BEGIN_ENTITY && s[s.size()-1]==CHAR_END_ENTITY) { - Common::MediaticData::EntityType type= - resolveEntityName(s.mid(1,s.size()-2),activeEntityGroups); + LimaString entityName(s.mid(1,s.size()-2)); + Common::MediaticData::EntityType type=resolveEntityName(entityName,activeEntityGroups); if (type.isNull()) { - AUCLOGINIT; - LERROR << "cannot resolve entity name " - << Common::Misc::limastring2utf8stdstring(s); + Common::MediaticData::EntityGroupId groupId = resolveGroupName(entityName,activeEntityGroups); + if( groupId == 0) { + AUCLOGINIT; + LERROR << "createTransition: cannot resolve entity name " + << Common::Misc::limastring2utf8stdstring(s); + } + else { + AUCLOGINIT; + LDEBUG << "createTransition: create EntityGroupTransition(" << groupId << ")"; + t=new EntityGroupTransition(groupId); + } } else { + AUCLOGINIT; + LDEBUG << "createTransition: create EntityTransition(" << type << ")"; t=new EntityTransition(type); } } @@ -283,6 +344,46 @@ TransitionUnit* createTransition(const LimaString str, return t; } +//********************************************************************** +// +Common::MediaticData::EntityGroupId +resolveGroupName(const LimaString s, + const std::vector& activeEntityGroups) +{ +#ifdef DEBUG_LP + AUCLOGINIT; + LDEBUG << 
"resolveGroupName: try to resolve group name " + << Common::Misc::limastring2utf8stdstring(s); +#endif + Common::MediaticData::EntityGroupId foundGroup; + try { + LimaString groupName=s; +#ifdef DEBUG_LP + LDEBUG << "resolveGroupName: try group name " << Common::Misc::limastring2utf8stdstring(s); +#endif + foundGroup = Common::MediaticData::MediaticData::single().getEntityGroupId(groupName); + // group is among active groups +#ifdef DEBUG_LP + LDEBUG << "resolveGroupName: foundGroup" << foundGroup; +#endif + for (vector::const_iterator it=activeEntityGroups.begin(), + it_end=activeEntityGroups.end(); it!=it_end; it++) { + if( groupName == *it ) { + return foundGroup; + } + AUCLOGINIT; + LERROR << "resolveGroupName: group " << Common::Misc::limastring2utf8stdstring(s) << " not active"; + return foundGroup; + } + } + catch (LimaException& e) { + AUCLOGINIT; + LERROR << "resolveGroupName: cannot resolve group for " + << Common::Misc::limastring2utf8stdstring(s); + } + return foundGroup; +} + //********************************************************************** // Common::MediaticData::EntityType @@ -291,21 +392,21 @@ resolveEntityName(const LimaString s, { #ifdef DEBUG_LP AUCLOGINIT; - LDEBUG << "TransitionCompiler: try to resolve entity name " + LDEBUG << "resolveEntityName: try to resolve entity name " << Common::Misc::limastring2utf8stdstring(s); #endif // test if word is a known entity name => in this case, entity transition if (s.indexOf(Common::MediaticData::MediaticData::single().getEntityTypeNameSeparator())!=-1) { #ifdef DEBUG_LP - LDEBUG << "TransitionCompiler: entity name is complete"; + LDEBUG << "resolveEntityName: entity name is complete"; #endif try { return Common::MediaticData::MediaticData::single().getEntityType(s); } catch (LimaException& e) { AUCLOGINIT; - LERROR << "unknown entity " << s; + LERROR << "resolveEntityName: unknown entity " << s; } } else { // try to find this entity in active groups @@ -315,14 +416,14 @@ resolveEntityName(const 
LimaString s, try { LimaString entityName=(*it)+Common::MediaticData::MediaticData::single().getEntityTypeNameSeparator()+s; #ifdef DEBUG_LP - LDEBUG << "TransitionCompiler: try entity name " << Common::Misc::limastring2utf8stdstring(entityName); + LDEBUG << "resolveEntityName: try entity name " << Common::Misc::limastring2utf8stdstring(entityName); #endif Common::MediaticData::EntityType findType= Common::MediaticData::MediaticData::single().getEntityType(entityName); if (!type.isNull()) { // there is ambiguity AUCLOGINIT; - LERROR << "cannot resolve entity group for entity " + LERROR << "resolveEntityName: cannot resolve entity group for entity " << Common::Misc::limastring2utf8stdstring(s) << " (at least two groups contain this entity)"; } @@ -333,14 +434,14 @@ resolveEntityName(const LimaString s, catch (LimaException& e) { // not in this group: do nothing (continue search) #ifdef DEBUG_LP - LDEBUG << "entity " << Common::Misc::limastring2utf8stdstring(s) + LDEBUG << "resolveEntityName: entity " << Common::Misc::limastring2utf8stdstring(s) << " not in group " << Common::Misc::limastring2utf8stdstring(*it); #endif } } - if (type.isNull()) { + if (type.isNull()) { // try to interpret s as group AUCLOGINIT; - LERROR << "cannot resolve entity group for entity " + LERROR << "resolveEntityName: cannot resolve entity group for entity " << Common::Misc::limastring2utf8stdstring(s) << " (no active group contains this entity)"; } @@ -411,7 +512,7 @@ Tpos createTpos(const std::string& s, MediaId language) { //search for separator '_' int sep(findSpecialCharacter(Common::Misc::utf8stdstring2limastring(s),CHAR_SEP_MACROMICRO_STRING,0)); if (sep != -1 && string(s,0,sep) == "L") { - // '_' found after L (L_NC) + // '_' found after L (NC) sep=findSpecialCharacter(Common::Misc::utf8stdstring2limastring(s),CHAR_SEP_MACROMICRO_STRING,sep+1); } if (sep == -1) { // only macro diff --git a/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/transitionCompiler.h 
b/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/transitionCompiler.h index 96db81ddf..72ddfbffa 100644 --- a/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/transitionCompiler.h +++ b/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/transitionCompiler.h @@ -47,6 +47,22 @@ Lima::LinguisticProcessing::Automaton::TransitionUnit* MediaId language, const std::string& id, const std::vector& activeEntityGroups); +/** + * Lima::LinguisticProcessing::Automaton::TransitionUnit* + createGazeteerTransition(const AutomatonString& automatonString, + MediaId language, const std::string& id, + const std::vector& activeEntityGroups, + const std::vector& gazeteerAsVectorOfString, + const bool keepTrigger); +*/ +Lima::LinguisticProcessing::Automaton::TransitionUnit* + createGazeteerTransition(const LimaString& gazeteerName, + MediaId language, const std::string& id, + const std::vector& activeEntityGroups, + const std::vector& gazeteers, + const bool keep=true, + const bool head=false); + Lima::LinguisticProcessing::Automaton::TransitionUnit* createTransition(const LimaString, MediaId language, const std::string& id, @@ -54,11 +70,15 @@ Lima::LinguisticProcessing::Automaton::TransitionUnit* const bool keep=true, const bool neg=false, const std::vector& constraints= - std::vector(0)); + std::vector(0), + const std::vector& gazeteerAsVectorOfString = std::vector(0) ); Common::MediaticData::EntityType resolveEntityName(const LimaString str, const std::vector& activeEntityGroups); +Common::MediaticData::EntityGroupId + resolveGroupName(const LimaString s, + const std::vector& activeEntityGroups); } // end namespace } // end namespace diff --git a/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/tstring.cpp b/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/tstring.cpp index 05088bdd9..ad96c73cc 100644 --- 
a/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/tstring.cpp +++ b/lima_linguisticprocessing/tools/automatonCompiler/libautomatonCompiler/tstring.cpp @@ -44,8 +44,7 @@ namespace Automaton { //*************************************************************************** void getlineLimaString(std::istream& in, LimaString& s) { // first get a string and convert it to wstring - std::string tmp; - getline(in,tmp); + std::string tmp = Lima::Common::Misc::readLine(in); s=Common::Misc::utf8stdstring2limastring(tmp); } diff --git a/lima_linguisticprocessing/tools/common/catBowFiles.cpp b/lima_linguisticprocessing/tools/common/catBowFiles.cpp index 7830fb450..6c86907bf 100644 --- a/lima_linguisticprocessing/tools/common/catBowFiles.cpp +++ b/lima_linguisticprocessing/tools/common/catBowFiles.cpp @@ -102,7 +102,7 @@ void readAndWriteBoWDocuments(ifstream& fileIn, BinaryWriterBoWDocumentHandler writer(fileOut); while (! fileIn.eof()) { document->reinit(); - reader.readBoWDocumentBlock(fileIn, *document, writer, false); + reader.readBoWDocumentBlock(fileIn, *document, writer, false, false); } } diff --git a/lima_linguisticprocessing/tools/common/getLexiconFromBoW.cpp b/lima_linguisticprocessing/tools/common/getLexiconFromBoW.cpp index 093fe0601..35064419e 100644 --- a/lima_linguisticprocessing/tools/common/getLexiconFromBoW.cpp +++ b/lima_linguisticprocessing/tools/common/getLexiconFromBoW.cpp @@ -165,12 +165,12 @@ LimaString getStringDecomp(boost::shared_ptr< BoWToken > token) { std::deque< BoWComplexToken::Part >::const_iterator it, it_end; boost::shared_ptr< BoWComplexToken> complexToken; switch (token->getType()) { - case BOW_TOKEN: + case BoWType::BOW_TOKEN: //cerr << "token is a simple token -> " << token->getString() << endl; return token->getString(); break; - case BOW_TERM: - case BOW_NAMEDENTITY: + case BoWType::BOW_TERM: + case BoWType::BOW_NAMEDENTITY: //cerr << "token is a complex token" << endl; 
complexToken=boost::dynamic_pointer_cast(token); it=complexToken->getParts().begin(); it_end=complexToken->getParts().end(); @@ -211,9 +211,9 @@ class GetLexiconBoWDocumentHandler : public AbstractBoWDocumentHandler const std::string& /*elementName*/) {} void processSBoWText(const BoWText* boWText, - bool useIterators); + bool useIterators, bool /*useIndexIterator*/); void processProperties(const Misc::GenericDocumentProperties* /*properties*/, - bool /*useIterators*/) + bool /*useIterators*/, bool /*useIndexIterator*/) {} void closeSBoWNode() {} @@ -227,7 +227,7 @@ class GetLexiconBoWDocumentHandler : public AbstractBoWDocumentHandler }; void GetLexiconBoWDocumentHandler::processSBoWText(const BoWText* text, - bool useIterators) + bool useIterators, bool /*useIndexIterator*/) { LIMA_UNUSED(useIterators); BoWTokenIterator it(*text); @@ -306,7 +306,7 @@ void readDocuments(ifstream& fileIn, BoWDocument* document, referenceProperties,filterCategory); while (! fileIn.eof()) { - reader.readBoWDocumentBlock(fileIn,*document,handler,true); + reader.readBoWDocumentBlock(fileIn,*document,handler,true,false); } } @@ -358,8 +358,8 @@ int run(int argc,char** argv) } - string resourcesPath=getenv("LIMA_RESOURCES")==0?"/usr/share/apps/lima/resources":string(getenv("LIMA_RESOURCES")); - string configDir=getenv("LIMA_CONF")==0?"/usr/share/config/lima":string(getenv("LIMA_CONF")); + string resourcesPath=qgetenv("LIMA_RESOURCES").isEmpty()?"/usr/share/apps/lima/resources":string(qgetenv("LIMA_RESOURCES").constData()); + string configDir=qgetenv("LIMA_CONF").isEmpty()?"/usr/share/config/lima":string(qgetenv("LIMA_CONF").constData()); if ( (!param.language.size()) && (!param.codeFile.size()) ) { cerr << "no codefile nor language specified !" 
<< endl; diff --git a/lima_linguisticprocessing/tools/common/parseXMLFile.cpp b/lima_linguisticprocessing/tools/common/parseXMLFile.cpp index d6abcf7f7..8b4db3d03 100644 --- a/lima_linguisticprocessing/tools/common/parseXMLFile.cpp +++ b/lima_linguisticprocessing/tools/common/parseXMLFile.cpp @@ -128,9 +128,9 @@ int run(int argc,char** argv) exit(0); } - string resourcesPath=getenv("LIMA_RESOURCES")==0?"/usr/share/apps/lima/resources":string(getenv("LIMA_RESOURCES")); + string resourcesPath=qgetenv("LIMA_RESOURCES").isEmpty()?"/usr/share/apps/lima/resources":string(qgetenv("LIMA_RESOURCES").constData()); string commonConfigFile=string("lima-common.xml"); - string configDir=getenv("LIMA_CONF")==0?"/usr/share/config/lima":string(getenv("LIMA_CONF")); + string configDir=qgetenv("LIMA_CONF").isEmpty()?"/usr/share/config/lima":string(qgetenv("LIMA_CONF").constData()); XMLConfigurationFileParser parser(param.inputFile); diff --git a/lima_linguisticprocessing/tools/common/parseXMLPropertyFile.cpp b/lima_linguisticprocessing/tools/common/parseXMLPropertyFile.cpp index df70b8a4b..967c70c20 100644 --- a/lima_linguisticprocessing/tools/common/parseXMLPropertyFile.cpp +++ b/lima_linguisticprocessing/tools/common/parseXMLPropertyFile.cpp @@ -23,6 +23,7 @@ #include "common/LimaCommon.h" +#include "common/Data/strwstrtools.h" #include "linguisticProcessing/common/PropertyCode/PropertyCodeManager.h" #include @@ -218,10 +219,10 @@ int run(int argc,char** argv) string line; while (fin.good() && !fin.eof()) { - getline(fin,line); - if (line.size()>0) { - LinguisticCode prop(atoi(line.c_str())); - decode(propcodemanager,prop); + line = Lima::Common::Misc::readLine(fin); + if (line.size()>0) { + LinguisticCode prop(atoi(line.c_str())); + decode(propcodemanager,prop); } } } diff --git a/lima_linguisticprocessing/tools/common/readBoWFile.cpp b/lima_linguisticprocessing/tools/common/readBoWFile.cpp index c65d86214..85047fc3d 100644 --- 
a/lima_linguisticprocessing/tools/common/readBoWFile.cpp +++ b/lima_linguisticprocessing/tools/common/readBoWFile.cpp @@ -262,7 +262,7 @@ void readSDocuments(ifstream& fileIn, BoWDocument* document, BoWBinaryReader& re TextWriterBoWDocumentHandler writer(cout); while (! fileIn.eof()) { - reader.readBoWDocumentBlock(fileIn, *document, writer, param.useIterator); + reader.readBoWDocumentBlock(fileIn, *document, writer, param.useIterator, param.useIndexIterator); } break; } @@ -272,7 +272,7 @@ void readSDocuments(ifstream& fileIn, BoWDocument* document, BoWBinaryReader& re writer.writeBoWDocumentsHeader(); while (! fileIn.eof()) { - reader.readBoWDocumentBlock(fileIn, *document, writer, param.useIterator); + reader.readBoWDocumentBlock(fileIn, *document, writer, param.useIterator, param.useIndexIterator); } writer.writeBoWDocumentsFooter(); } @@ -281,7 +281,7 @@ void readSDocuments(ifstream& fileIn, BoWDocument* document, BoWBinaryReader& re SBoWStatWriter writer; while (! fileIn.eof()) { - reader.readBoWDocumentBlock(fileIn, *document, writer, param.useIterator); + reader.readBoWDocumentBlock(fileIn, *document, writer, param.useIterator, param.useIndexIterator); } cout << writer << endl; break; diff --git a/lima_linguisticprocessing/tools/common/readLinguisticData.cpp b/lima_linguisticprocessing/tools/common/readLinguisticData.cpp index 694200708..67ffee467 100644 --- a/lima_linguisticprocessing/tools/common/readLinguisticData.cpp +++ b/lima_linguisticprocessing/tools/common/readLinguisticData.cpp @@ -68,9 +68,9 @@ int run(int argc,char** argv) // Necessary to initialize factories Lima::AmosePluginsManager::single(); - string resourcesPath=getenv("LIMA_RESOURCES")==0?"/usr/share/apps/lima/resources":string(getenv("LIMA_RESOURCES")); + string resourcesPath=qgetenv("LIMA_RESOURCES").isEmpty()?"/usr/share/apps/lima/resources":string(qgetenv("LIMA_RESOURCES").constData()); string configFile=string("lima-common.xml"); - string 
configDir=getenv("LIMA_CONF")==0?"/usr/share/config/lima":string(getenv("LIMA_CONF")); + string configDir=qgetenv("LIMA_CONF").isEmpty()?"/usr/share/config/lima":string(qgetenv("LIMA_CONF").constData()); std::deque langs; diff --git a/lima_linguisticprocessing/tools/common/testAccessMethod.cpp b/lima_linguisticprocessing/tools/common/testAccessMethod.cpp index ff528f886..acb11dada 100644 --- a/lima_linguisticprocessing/tools/common/testAccessMethod.cpp +++ b/lima_linguisticprocessing/tools/common/testAccessMethod.cpp @@ -183,9 +183,9 @@ int run(int argc,char** argv) void testAccessMethod(const Param& param ) { - string resourcesPath=getenv("LIMA_RESOURCES")==0?"/usr/share/apps/lima/resources":string(getenv("LIMA_RESOURCES")); + string resourcesPath=qgetenv("LIMA_RESOURCES").isEmpty()?"/usr/share/apps/lima/resources":string(qgetenv("LIMA_RESOURCES").constData()); string commonConfigFile=string("lima-common.xml"); - string configDir=getenv("LIMA_CONF")==0?"/usr/share/config/lima":string(getenv("LIMA_CONF")); + string configDir=qgetenv("LIMA_CONF").isEmpty()?"/usr/share/config/lima":string(qgetenv("LIMA_CONF").constData()); AbstractAccessByString* accessMethod(0); if (param.accessMethod == "fsa") diff --git a/lima_linguisticprocessing/tools/common/testContentDict16.cpp b/lima_linguisticprocessing/tools/common/testContentDict16.cpp index 27510fc5d..7ac81b185 100644 --- a/lima_linguisticprocessing/tools/common/testContentDict16.cpp +++ b/lima_linguisticprocessing/tools/common/testContentDict16.cpp @@ -49,6 +49,20 @@ using namespace std; #include +#define ANTINNO_SPECIFIC_LOG + +#ifdef ANTINNO_SPECIFIC_LOG +// FWI 12/05/2015 utilisation de composants d's3 +#include "antinno.s3.config.h" +#include "antinno.s3.fs.File.class.h" +#include "antinno.s3.fs.Directory.class.h" +#include "antinno.s3.fs.FileName.class.h" +#include "antinno.s3.log.Log4cpp.class.h" +#if defined WIN32 +#include "windows.h" +#endif +#endif + //#include "common/linguisticData/linguisticData.h" using 
namespace Lima; @@ -99,10 +113,36 @@ int main(int argc, char **argv) int run(int argc,char** argv) { +#ifndef ANTINNO_SPECIFIC_LOG QsLogging::initQsLog(); // Necessary to initialize factories Lima::AmosePluginsManager::single(); +#else + LoadLibrary("antinno.s3lib.dll"); + static ::antinno::s3::log::Log4cpp log1; + { + + using namespace ::antinno; + QString const c = ::std::getenv("AMOSE_CONF"); + if (c.isEmpty()) + { + std::cerr << "No environment variable \"AMOSE_CONF\" set or variable is empty" << std::endl; + return EXIT_FAILURE; + } + + QString log4cppFilePath = c + "/" + "AntTextIndexer.log4cpp"; + s3::fs::File const log4cppFile((s3::fs::Path(::boost::locale::conv::utf_to_utf(log4cppFilePath.toUtf8().constData()).c_str()))); + //::std::wcout << log4cppFile << ::std::endl; + log1.configure(log4cppFile); + ::antinno::s3::global.log(log1); + if (!QsLogging::Categories::instance().configure(log4cppFilePath.toAscii().constData())) + { + std::cerr << "Configure Problem " << log4cppFilePath.toAscii().constData() << std::endl; + return EXIT_FAILURE; + } + } +#endif cerr << "testContentDict16 begin..." 
<< endl; setlocale(LC_ALL, ""); @@ -257,9 +297,9 @@ const Lima::LimaString& word) const{ void testAnalysisDico(const Param& param ) { - string resourcesPath=getenv("LIMA_RESOURCES")==0?"/usr/share/apps/lima/resources":string(getenv("LIMA_RESOURCES")); + string resourcesPath=qgetenv("LIMA_RESOURCES").isEmpty()?"/usr/share/apps/lima/resources":string(qgetenv("LIMA_RESOURCES").constData()); string commonConfigFile=string("lima-common.xml"); - string configDir=getenv("LIMA_CONF")==0?"/usr/share/config/lima":string(getenv("LIMA_CONF")); + string configDir=qgetenv("LIMA_CONF").isEmpty()?"/usr/share/config/lima":string(qgetenv("LIMA_CONF").constData()); MyAnalysisDico* dico = new MyAnalysisDico(analysisDataElement(0)); dico->parseAccessMethod(param.keyFileName); diff --git a/lima_linguisticprocessing/tools/common/testReadLexicon.cpp b/lima_linguisticprocessing/tools/common/testReadLexicon.cpp index d0734d0c7..d4ed34e61 100644 --- a/lima_linguisticprocessing/tools/common/testReadLexicon.cpp +++ b/lima_linguisticprocessing/tools/common/testReadLexicon.cpp @@ -47,9 +47,9 @@ Param; void testAccessMethod(const Param& param ) { - string resourcesPath=getenv("LIMA_RESOURCES")==0?"/usr/share/apps/lima/resources":string(getenv("LIMA_RESOURCES")); - string commonConfigFile=getenv("LIMA_CONF")==0?"/usr/share/config/lima":string("lima-common.xml"); - string configDir=string(getenv("LIMA_CONF")); + string resourcesPath=qgetenv("LIMA_RESOURCES").isEmpty()?"/usr/share/apps/lima/resources":string(qgetenv("LIMA_RESOURCES").constData()); + string commonConfigFile=string("lima-common.xml"); + string configDir=qgetenv("LIMA_CONF").isEmpty()?"/usr/share/config/lima":string(qgetenv("LIMA_CONF").constData()); // Load lexicon Lima::Common::FsaAccess::FsaAccessSpare16* fsaAccess=new Lima::Common::FsaAccess::FsaAccessSpare16(); diff --git a/lima_linguisticprocessing/tools/dictionary/compileDictionary.cpp b/lima_linguisticprocessing/tools/dictionary/compileDictionary.cpp index d58afe651..ea74e4e17 
100644 --- a/lima_linguisticprocessing/tools/dictionary/compileDictionary.cpp +++ b/lima_linguisticprocessing/tools/dictionary/compileDictionary.cpp @@ -1,318 +1,360 @@ -/* - Copyright 2002-2013 CEA LIST - - This file is part of LIMA. - - LIMA is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - LIMA is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with LIMA. If not, see -*/ - -#include -#include -#include -#include -#include - -#include "common/LimaCommon.h" -#include "common/Data/strwstrtools.h" -#include "common/Data/LimaString.h" -#include "linguisticProcessing/common/PropertyCode/PropertyCodeManager.h" -#include "common/FsaAccess/FsaAccessSpare16.h" -#include "common/misc/AbstractAccessByString.h" -#include "linguisticProcessing/core/FlatTokenizer/CharChart.h" -// #include "linguisticProcessing/core/Tokenizer/ParseChar.h" -// #include "linguisticProcessing/core/Tokenizer/ParseCharClass.h" - -#include "KeysLogger.h" -#include "DictionaryHandler.h" - -#include -#include - -using namespace std; -using namespace Lima; -using namespace Lima::Common; -using namespace Lima::Common::PropertyCode; -using namespace Lima::Common::FsaAccess; -using namespace Lima::Common::Misc; -using namespace Lima::LinguisticProcessing; -using namespace Lima::LinguisticProcessing::FlatTokenizer; - -void usage() -{ - std::cerr << "USAGE : compileDictionary [OPTIONS] file" << std::endl; - std::cerr << "where [OPTIONS] are : " << std::endl; - std::cerr << " --extractKeyList= : only extract keys list to file, no compilation" << endl; - 
std::cerr << " --charChart= : specify charchart file" << endl; - std::cerr << " --fsaKey= : provide fsa access keys to compile" << endl; - std::cerr << " --propertyFile= : specify property coding system (xml file)" << endl; - std::cerr << " --symbolicCodes= : specify symbolic codes file (xml)" << endl; - std::cerr << " --output= : specify output file" << endl; - std::cerr << " --reverse-keys : reverse entries keys" << endl; -} - -// options -typedef struct ParamStruct -{ - std::string extractKeys; - std::string charChart; - std::string fsaKey; - std::string propertyFile; - std::string symbolicCodes; - std::string output; - std::string input; - bool reverseKeys; -} -Param; - - -#include "common/tools/LimaMainTaskRunner.h" -#include "common/AbstractFactoryPattern/AmosePluginsManager.h" -#include - -int run(int aargc,char** aargv); - -int main(int argc, char **argv) -{ - QCoreApplication a(argc, argv); - - // Task parented to the application so that it - // will be deleted by the application. - LimaMainTaskRunner* task = new LimaMainTaskRunner(argc, argv, run, &a); - - // This will cause the application to exit when - // the task signals finished. - QObject::connect(task, SIGNAL(finished(int)), &a, SLOT(quit())); - - // This will run the task from the application event loop. 
- QTimer::singleShot(0, task, SLOT(run())); - - return a.exec(); - -} - - -int run(int argc,char** argv) -{ - QsLogging::initQsLog(); - // Necessary to initialize factories - Lima::AmosePluginsManager::single(); - - setlocale(LC_ALL,"fr_FR.UTF-8"); - - Param param = { - std::string(""), - std::string(""), - std::string(""), - std::string(""), - std::string(""), - std::string(""), - std::string(""), - false}; - - - for (int i = 1 ; i < argc; i++) - { - std::string arg(argv[i]); - int pos = -1; - if (arg == "--help") - { - usage(); - return 0; - } - if ( (pos = arg.find("--extractKeyList=")) != -1 ) - { - param.extractKeys = arg.substr(pos+17); - } - else if ( (pos = arg.find("--fsaKey=")) != -1 ) - { - param.fsaKey = arg.substr(pos+9); - } - else if ( (pos = arg.find("--charChart=")) != -1 ) - { - param.charChart = arg.substr(pos+12); - } - else if ( (pos = arg.find("--propertyFile=")) != -1 ) - { - param.propertyFile = arg.substr(pos+15); - } - else if ( (pos = arg.find("--symbolicCodes=")) != -1 ) - { - param.symbolicCodes = arg.substr(pos+16); - } - else if ( (pos = arg.find("--output=")) != -1 ) - { - param.output = arg.substr(pos+9); - } - else if ( (pos = arg.find("--reverse-keys")) != -1 ) - { - param.reverseKeys = true; - } - else - { - param.input = arg; - } - } - - // check that input file exists - { - ifstream fin(param.input.c_str(), std::ifstream::binary); - if (!fin.good()) - { - cerr << "can't open input file " << param.input << endl; - exit(-1); - } - fin.close(); - } - - // parse charchart - if (param.charChart == "") { - cerr << "please specify CharChart file with --charChart= option" << endl; - exit(0); - } - CharChart* charChart = new CharChart(); - charChart->loadFromFile(param.charChart); - - try - { - cerr << "parse charChart file : " << param.charChart << endl; -// cerr << "TODO: to implement at "<<__FILE__<<", line "<<__LINE__<<"!" 
<setValidationScheme(SAXParser::Val_Auto); - // parser->setDoNamespaces(false); - // parser->setDoSchema(false); - // parser->setValidationSchemaFullChecking(false); - parser.setContentHandler(&keysLogger); - parser.setErrorHandler(&keysLogger); - QFile file(param.input.c_str()); - if (!file.open(QIODevice::ReadOnly)) - { - std::cerr << "Error opening " << param.input << std::endl; - return 1; - } - if (!parser.parse( QXmlInputSource(&file))) - { - std::cerr << "Error parsing " << param.input << " : " << parser.errorHandler()->errorString().toUtf8().constData() << std::endl; - return 1; - } - else - { - std::cerr << std::endl; - } - } - catch (const XMLException& toCatch) - { - std::cerr << "An error occurred Error: " << toCatch.getMessage() << endl; - throw; - } - fout.close(); - } else { - // compile dictionaries - - cerr << "parse property code file : " << param.propertyFile << endl; - PropertyCodeManager propcodemanager; - propcodemanager.readFromXmlFile(param.propertyFile); - - cerr << "parse symbolicCode file : " << param.symbolicCodes << endl; - map conversionMap; - propcodemanager.convertSymbolicCodes(param.symbolicCodes,conversionMap); - cerr << conversionMap.size() << " code read from symbolicCode file" << endl; -/* for (map::const_iterator it=conversionMap.begin(); - it!=conversionMap.end(); - it++) - { - cerr << it->first << " -> " << it->second << endl; - }*/ - - AbstractAccessByString* access(0); - if (param.fsaKey!="") { - cerr << "load fsa access method : " << param.fsaKey << endl; - FsaAccessSpare16* fsaAccess=new FsaAccessSpare16(); - fsaAccess->read(param.fsaKey); - access=fsaAccess; - } else { - cerr << "ERROR : no access Keys defined !" 
<< endl; - exit(-1); - } - cerr << access->getSize() << " keys loaded" << endl; - - cerr << "parse input file : " << param.input << endl; - DictionaryCompiler handler(charChart,access,conversionMap,param.reverseKeys); - - QXmlSimpleReader parser; -// parser->setValidationScheme(SAXParser::Val_Auto); -// parser->setDoNamespaces(false); -// parser->setDoSchema(false); -// parser->setValidationSchemaFullChecking(false); - try - { - parser.setContentHandler(&handler); - parser.setErrorHandler(&handler); - QFile file(param.input.c_str()); - if (!file.open(QIODevice::ReadOnly)) - { - std::cerr << "Error opening " << param.input << std::endl; - return 1; - } - if (!parser.parse( QXmlInputSource(&file))) - { - std::cerr << "Error parsing " << param.input << " : " << parser.errorHandler()->errorString().toUtf8().constData() << std::endl; - return 1; - } - } - catch (const XMLException& toCatch) - { - cerr << "An error occurred Error: " << toCatch.getMessage() << endl; - throw; - } - - cerr << "write data to output file : " << param.output << endl; - ofstream fout(param.output.c_str(),ios::out | ios::binary); - if (!fout.good()) - { - cerr << "can't open file " << param.output << endl; - exit(-1); - } - handler.writeBinaryDictionary(fout); - fout.close(); - delete access; - } - return EXIT_SUCCESS; -} +/* + Copyright 2002-2013 CEA LIST + + This file is part of LIMA. + + LIMA is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LIMA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with LIMA. 
If not, see +*/ + +#include +#include +#include +#include +#include + +#include "common/LimaCommon.h" +#include "common/Data/strwstrtools.h" +#include "common/Data/LimaString.h" +#include "linguisticProcessing/common/PropertyCode/PropertyCodeManager.h" +#include "common/FsaAccess/FsaAccessSpare16.h" +#include "common/misc/AbstractAccessByString.h" +#include "linguisticProcessing/core/FlatTokenizer/CharChart.h" +// #include "linguisticProcessing/core/Tokenizer/ParseChar.h" +// #include "linguisticProcessing/core/Tokenizer/ParseCharClass.h" + +#include "KeysLogger.h" +#include "DictionaryHandler.h" + +#include +#include + +#ifdef ANTINNO_SPECIFIC +#include "common/AbstractFactoryPattern/antinno.LibraryLoader.class.h" +#endif + +using namespace std; +using namespace Lima; +using namespace Lima::Common; +using namespace Lima::Common::PropertyCode; +using namespace Lima::Common::FsaAccess; +using namespace Lima::Common::Misc; +using namespace Lima::LinguisticProcessing; +using namespace Lima::LinguisticProcessing::FlatTokenizer; + +void usage() +{ + std::cerr << "USAGE : compileDictionary [OPTIONS] file" << std::endl; + std::cerr << "where [OPTIONS] are : " << std::endl; + std::cerr << " --extractKeyList= : only extract keys list to file, no compilation" << endl; + std::cerr << " --charChart= : specify charchart file" << endl; + std::cerr << " --fsaKey= : provide fsa access keys to compile" << endl; + std::cerr << " --propertyFile= : specify property coding system (xml file)" << endl; + std::cerr << " --symbolicCodes= : specify symbolic codes file (xml)" << endl; + std::cerr << " --output= : specify output file" << endl; + std::cerr << " --reverse-keys : reverse entries keys" << endl; +} + +// options +typedef struct ParamStruct +{ + std::string extractKeys; + std::string charChart; + std::string fsaKey; + std::string propertyFile; + std::string symbolicCodes; + std::string output; + std::string input; + bool reverseKeys; +} +Param; + + +#include 
"common/tools/LimaMainTaskRunner.h" +#include "common/AbstractFactoryPattern/AmosePluginsManager.h" +#include + +int run(int aargc,char** aargv); + +int main(int argc, char **argv) +{ + QCoreApplication a(argc, argv); + + // Task parented to the application so that it + // will be deleted by the application. + LimaMainTaskRunner* task = new LimaMainTaskRunner(argc, argv, run, &a); + + // This will cause the application to exit when + // the task signals finished. + QObject::connect(task, SIGNAL(finished(int)), &a, SLOT(quit())); + + // This will run the task from the application event loop. + QTimer::singleShot(0, task, SLOT(run())); + + return a.exec(); + +} + + +int run(int argc,char** argv) +{ +#ifdef ANTINNO_SPECIFIC + + + { + + + ::std::string const configDir = ::std::getenv("AMOSE_CONF"); + if (configDir.empty()) + { + std::cerr << "No environment variable \"AMOSE_CONF\" set or variable is empty" << std::endl; + return EXIT_FAILURE; + } + + try + { + ::std::string const file = configDir + "/plugins.txt"; + Lima::antinno::LibraryLoader().loadFromFile(file); + } + catch (::std::exception const& ex) + { + std::cerr << "Exception during plugins loading. 
" << ex.what() << std::endl; + return EXIT_FAILURE; + } + + ::std::string const log4cppFilePath = configDir + "/log4cpp.properties"; + ::boost::shared_ptr pLog1(new QsLogging::antinno::Log4cpp()); + pLog1->configure(log4cppFilePath); + QsLogging::antinno::log = pLog1; + if (!QsLogging::Categories::instance().configure(log4cppFilePath.data())) + { + std::cerr << "Configure Problem " << log4cppFilePath << std::endl; + return EXIT_FAILURE; + } + + ::std::cout << "Plugins initialized" << ::std::endl; + } +#else + QsLogging::initQsLog(); + // Necessary to initialize factories + Lima::AmosePluginsManager::single(); +#endif + + setlocale(LC_ALL,"fr_FR.UTF-8"); + + Param param = { + std::string(""), + std::string(""), + std::string(""), + std::string(""), + std::string(""), + std::string(""), + std::string(""), + false}; + + + for (int i = 1 ; i < argc; i++) + { + std::string arg(argv[i]); + int pos = -1; + if (arg == "--help") + { + usage(); + return 0; + } + if ( (pos = arg.find("--extractKeyList=")) != -1 ) + { + param.extractKeys = arg.substr(pos+17); + } + else if ( (pos = arg.find("--fsaKey=")) != -1 ) + { + param.fsaKey = arg.substr(pos+9); + } + else if ( (pos = arg.find("--charChart=")) != -1 ) + { + param.charChart = arg.substr(pos+12); + } + else if ( (pos = arg.find("--propertyFile=")) != -1 ) + { + param.propertyFile = arg.substr(pos+15); + } + else if ( (pos = arg.find("--symbolicCodes=")) != -1 ) + { + param.symbolicCodes = arg.substr(pos+16); + } + else if ( (pos = arg.find("--output=")) != -1 ) + { + param.output = arg.substr(pos+9); + } + else if ( (pos = arg.find("--reverse-keys")) != -1 ) + { + param.reverseKeys = true; + } + else + { + param.input = arg; + } + } + + // check that input file exists + { + ifstream fin(param.input.c_str(), std::ifstream::binary); + if (!fin.good()) + { + cerr << "can't open input file " << param.input << endl; + exit(-1); + } + fin.close(); + } + + // parse charchart + if (param.charChart == "") { + cerr << "please 
specify CharChart file with --charChart= option" << endl; + exit(0); + } + CharChart* charChart = new CharChart(); + charChart->loadFromFile(param.charChart); + + try + { + cerr << "parse charChart file : " << param.charChart << endl; +// cerr << "TODO: to implement at "<<__FILE__<<", line "<<__LINE__<<"!" <setValidationScheme(SAXParser::Val_Auto); + // parser->setDoNamespaces(false); + // parser->setDoSchema(false); + // parser->setValidationSchemaFullChecking(false); + parser.setContentHandler(&keysLogger); + parser.setErrorHandler(&keysLogger); + QFile file(param.input.c_str()); + if (!file.open(QIODevice::ReadOnly)) + { + std::cerr << "Error opening " << param.input << std::endl; + return 1; + } + if (!parser.parse( QXmlInputSource(&file))) + { + std::cerr << "Error parsing " << param.input << " : " << parser.errorHandler()->errorString().toUtf8().constData() << std::endl; + return 1; + } + else + { + std::cerr << std::endl; + } + } + catch (const XMLException& toCatch) + { + std::cerr << "An error occurred Error: " << toCatch.getMessage() << endl; + throw; + } + fout.close(); + } else { + // compile dictionaries + + cerr << "parse property code file : " << param.propertyFile << endl; + PropertyCodeManager propcodemanager; + propcodemanager.readFromXmlFile(param.propertyFile); + + cerr << "parse symbolicCode file : " << param.symbolicCodes << endl; + map conversionMap; + propcodemanager.convertSymbolicCodes(param.symbolicCodes,conversionMap); + cerr << conversionMap.size() << " code read from symbolicCode file" << endl; +/* for (map::const_iterator it=conversionMap.begin(); + it!=conversionMap.end(); + it++) + { + cerr << it->first << " -> " << it->second << endl; + }*/ + + AbstractAccessByString* access(0); + if (param.fsaKey!="") { + cerr << "load fsa access method : " << param.fsaKey << endl; + FsaAccessSpare16* fsaAccess=new FsaAccessSpare16(); + fsaAccess->read(param.fsaKey); + access=fsaAccess; + } else { + cerr << "ERROR : no access Keys defined !" 
<< endl; + exit(-1); + } + cerr << access->getSize() << " keys loaded" << endl; + + cerr << "parse input file : " << param.input << endl; + DictionaryCompiler handler(charChart,access,conversionMap,param.reverseKeys); + + QXmlSimpleReader parser; +// parser->setValidationScheme(SAXParser::Val_Auto); +// parser->setDoNamespaces(false); +// parser->setDoSchema(false); +// parser->setValidationSchemaFullChecking(false); + try + { + parser.setContentHandler(&handler); + parser.setErrorHandler(&handler); + QFile file(param.input.c_str()); + if (!file.open(QIODevice::ReadOnly)) + { + std::cerr << "Error opening " << param.input << std::endl; + return 1; + } + if (!parser.parse( QXmlInputSource(&file))) + { + std::cerr << "Error parsing " << param.input << " : " << parser.errorHandler()->errorString().toUtf8().constData() << std::endl; + return 1; + } + } + catch (const XMLException& toCatch) + { + cerr << "An error occurred Error: " << toCatch.getMessage() << endl; + throw; + } + + cerr << "write data to output file : " << param.output << endl; + ofstream fout(param.output.c_str(),ios::out | ios::binary); + if (!fout.good()) + { + cerr << "can't open file " << param.output << endl; + exit(-1); + } + handler.writeBinaryDictionary(fout); + fout.close(); + delete access; + } + return EXIT_SUCCESS; +} diff --git a/lima_linguisticprocessing/tools/dictionary/testComposedDict.cpp b/lima_linguisticprocessing/tools/dictionary/testComposedDict.cpp index e0c77c8ea..4309cc3ab 100644 --- a/lima_linguisticprocessing/tools/dictionary/testComposedDict.cpp +++ b/lima_linguisticprocessing/tools/dictionary/testComposedDict.cpp @@ -34,8 +34,10 @@ #include "common/XMLConfigurationFiles/xmlConfigurationFileExceptions.h" #include "common/MediaticData/mediaticData.h" #include "common/Data/LimaString.h" +#include "common/tools/FileUtils.h" #include "common/misc/fsaStringsPool.h" #include "common/FsaAccess/FsaAccessSpare16.h" +#include "common/FsaAccess/FsaAccessSpare16.h" #include 
"linguisticProcessing/core/AnalysisDict/AbstractAnalysisDictionary.h" #include "linguisticProcessing/core/AnalysisDict/EnhancedAnalysisDictionary.h" #include "DictionaryEntryLogger.h" @@ -64,6 +66,7 @@ typedef struct ParamStruct std::string defaultDataFileName; std::string key; std::string keyFile; + std::string limaConfigFile; int offset; bool superword; bool withDebug; @@ -120,6 +123,7 @@ int run(int argc,char** argv) std::string(""), std::string(""), std::string(""), + std::string(""), -1, false, false @@ -138,6 +142,10 @@ int run(int argc,char** argv) { param.language = arg.substr(pos+11); } + else if ( (pos = arg.find("--limaConfigFile=")) != std::string::npos ) + { + param.limaConfigFile = arg.substr(pos+17); + } else if ( (pos = arg.find("--dicoId=")) != std::string::npos ) { param.dicoId = arg.substr(pos+9); @@ -182,8 +190,8 @@ int run(int argc,char** argv) } - std::string resourcesPath=getenv("LIMA_RESOURCES")==0?"/usr/share/apps/lima/resources":string(getenv("LIMA_RESOURCES")); - std::string configDir=getenv("LIMA_CONF")==0?"/usr/share/config/lima":string(getenv("LIMA_CONF")); + std::string resourcesPath=qgetenv("LIMA_RESOURCES").isEmpty()?"/usr/share/apps/lima/resources":string(qgetenv("LIMA_RESOURCES").constData()); + std::string configDir=qgetenv("LIMA_CONF").isEmpty()?"/usr/share/config/lima":string(qgetenv("LIMA_CONF").constData()); std::string commonConfigFile="/lima-common.xml"; deque langs; langs.push_back(param.language); @@ -199,17 +207,21 @@ int run(int argc,char** argv) cout << " --dicoId='" << param.dicoId << "'" << endl; - string configPath=Common::MediaticData::MediaticData::single().getConfigPath(); + QString configPath=QString::fromUtf8(Common::MediaticData::MediaticData::single().getConfigPath().c_str()); cout << "load language " << param.language << endl; MediaId langid=MediaticData::single().getMediaId(param.language); - string file; + QString file; try { - Common::XMLConfigurationFiles::XMLConfigurationFileParser 
configuration(configPath + "/lima-analysis.xml"); - file=configPath + "/" + configuration.getModuleGroupParamValue( + QString configurationFile = Common::Misc::findFileInPaths(configPath, QString::fromUtf8("lima-analysis.xml")); + if (! param.limaConfigFile.empty()) { + configurationFile=QString::fromUtf8(param.limaConfigFile.c_str()); + } + Common::XMLConfigurationFiles::XMLConfigurationFileParser configuration(configurationFile.toUtf8().constData()); + file = Common::Misc::findFileInPaths(configPath, QString::fromUtf8( configuration.getModuleGroupParamValue( "lima-coreclient", "mediaProcessingDefinitionFiles", - param.language); + param.language).c_str() ) ); } catch (NoSuchParam& ) { @@ -217,7 +229,7 @@ int run(int argc,char** argv) throw InvalidConfiguration(); } - XMLConfigurationFileParser langParser(file); + XMLConfigurationFileParser langParser(file.toUtf8().constData()); // initialize resources try diff --git a/lima_linguisticprocessing/tools/normalize/desaccent.cpp b/lima_linguisticprocessing/tools/normalize/desaccent.cpp index 5153bf129..ffcf4a8c7 100644 --- a/lima_linguisticprocessing/tools/normalize/desaccent.cpp +++ b/lima_linguisticprocessing/tools/normalize/desaccent.cpp @@ -187,7 +187,7 @@ int run(int argc,char** argv) string line; if (fin.good()) { - getline(fin,line); + line = Lima::Common::Misc::readLine(fin); while (fin.good() && !fin.eof() && line!="") { LimaString str=utf8stdstring2limastring(line); @@ -201,7 +201,7 @@ int run(int argc,char** argv) res = charChart->unmark(str); } cout << limastring2utf8stdstring(res) << endl; - getline(fin,line); + line = Lima::Common::Misc::readLine(fin); } } } diff --git a/lima_linguisticprocessing/tools/normalize/normalizeTerm.cpp b/lima_linguisticprocessing/tools/normalize/normalizeTerm.cpp index 2e6afdc7e..1f7e52dc6 100644 --- a/lima_linguisticprocessing/tools/normalize/normalizeTerm.cpp +++ b/lima_linguisticprocessing/tools/normalize/normalizeTerm.cpp @@ -22,6 +22,7 @@ 
***************************************************************************/ #include "common/LimaCommon.h" +#include "common/tools/FileUtils.h" #include "common/tools/LimaMainTaskRunner.h" #include "common/MediaticData/mediaticData.h" #include "common/XMLConfigurationFiles/xmlConfigurationFileParser.h" @@ -89,10 +90,6 @@ int main(int argc, char **argv) int run(int argc,char** argv) { - QsLogging::initQsLog(); - // Necessary to initialize factories - Lima::AmosePluginsManager::single(); - bool docatch = false; if (argc>1) { @@ -125,12 +122,10 @@ int run(int argc,char** argv) return dowork(argc,argv); } - int dowork(int argc,char* argv[]) { - - string resourcesPath=getenv("LIMA_RESOURCES")==0?"/usr/share/apps/lima/resources":string(getenv("LIMA_RESOURCES")); - string configDir=getenv("LIMA_CONF")==0?"/usr/share/config/lima":string(getenv("LIMA_CONF")); + string resourcesPathParam=qgetenv("LIMA_RESOURCES").isEmpty()?"/usr/share/apps/lima/resources":string(qgetenv("LIMA_RESOURCES").constData()); + string configPathParam=qgetenv("LIMA_CONF").isEmpty()?"/usr/share/config/lima":string(qgetenv("LIMA_CONF").constData()); string lpConfigFile=string("lima-analysis.xml"); string commonConfigFile=string("lima-common.xml"); string pipeline=string("normalization"); @@ -161,9 +156,9 @@ int dowork(int argc,char* argv[]) else if ( (pos = arg.find("--common-config-file=")) != std::string::npos ) commonConfigFile = arg.substr(pos+21); else if ( (pos = arg.find("--config-dir=")) != std::string::npos ) - configDir = arg.substr(pos+13); + configPathParam = arg.substr(pos+13); else if ( (pos = arg.find("--resources-dir=")) != std::string::npos ) - resourcesPath = arg.substr(pos+16); + resourcesPathParam = arg.substr(pos+16); else if ( (pos = arg.find("--language=")) != std::string::npos ) langs.push_back(arg.substr(pos+11)); // else if ( (pos = arg.find("--pipeline=")) != std::string::npos ) @@ -185,29 +180,50 @@ int dowork(int argc,char* argv[]) return -1; } - 
AbstractLinguisticProcessingClient* client(0); + QStringList configDirs = buildConfigurationDirectoriesList(QStringList() << "lima",QStringList()); + QString configPath = configDirs.join(LIMA_PATH_SEPARATOR); + if (!configPathParam.empty()) + { + configPath = QString::fromUtf8(configPathParam.c_str()); + configDirs = configPath.split(LIMA_PATH_SEPARATOR); + } + QStringList resourcesDirs = buildResourcesDirectoriesList(QStringList() << "lima",QStringList()); + QString resourcesPath = resourcesDirs.join(LIMA_PATH_SEPARATOR); + if (!resourcesPathParam.empty()) + { + resourcesPath = QString::fromUtf8(resourcesPathParam.c_str()); + resourcesDirs = resourcesPath.split(LIMA_PATH_SEPARATOR); + } + + QsLogging::initQsLog(configPath); + // Necessary to initialize factories + Lima::AmosePluginsManager::single(); + Lima::AmosePluginsManager::changeable().loadPlugins(configPath); try { // initialize common MediaticData::changeable().init( - resourcesPath, - configDir, + resourcesPath.toUtf8().constData(), + configPath.toUtf8().constData(), commonConfigFile, langs); // initialize linguistic processing deque pipelines; pipelines.push_back(pipeline); - Lima::Common::XMLConfigurationFiles::XMLConfigurationFileParser lpconfig(configDir + "/" + lpConfigFile); + + QString lpConfigFileFound = Common::Misc::findFileInPaths(configPath, lpConfigFile.c_str(), LIMA_PATH_SEPARATOR); + + Lima::Common::XMLConfigurationFiles::XMLConfigurationFileParser lpconfig(lpConfigFileFound.toUtf8().constData()); LinguisticProcessingClientFactory::changeable().configureClientFactory( - clientId, - lpconfig, - langs, - pipelines); + clientId, + lpconfig, + langs, + pipelines); - client=dynamic_cast(LinguisticProcessingClientFactory::single().createClient(clientId)); + shared_ptr client= std::dynamic_pointer_cast(LinguisticProcessingClientFactory::single().createClient(clientId)); // Set the handlers std::map handlers; @@ -233,7 +249,7 @@ int dowork(int argc,char* argv[]) char buf[256]; 
file.getline(buf,256); std::string line(buf); - while (!file.eof()) + while (file.good()) { if (line.size()==0) { @@ -248,13 +264,12 @@ int dowork(int argc,char* argv[]) // analyze it metaData["FileName"]=*fileItr; - - // Lima::TimeUtilsController *timer = new Lima::TimeUtilsController("test",true); + + // Lima::TimeUtilsController *timer = new Lima::TimeUtilsController("test",true); client->analyze(contentText,metaData,pipeline,handlers); - // delete timer; - - - + // delete timer; + + // analyze resulting bowText to extract normalization multimap norms=extractNormalization(contentText,bowTextHandler.getBowText(),lang); if (norms.empty()) @@ -282,7 +297,6 @@ int dowork(int argc,char* argv[]) throw e; } - delete client; return SUCCESS_ID; } @@ -305,7 +319,7 @@ multimap extractNormalization(const LimaString& source,const bowItr!=bowText.end(); bowItr++) { - if ((*bowItr)->getType() != BOW_PREDICATE) + if ((*bowItr)->getType() != BoWType::BOW_PREDICATE) { pair posLen=getStartEnd(static_cast(&**bowItr)); // cerr << " - " << (*bowItr)->getLemma() << " at " << posLen.first << "," << posLen.second; @@ -326,7 +340,7 @@ multimap extractNormalization(const LimaString& source,const pair getStartEnd(const BoWToken* tok) { pair res; - if (tok->getType()==BOW_TOKEN) + if (tok->getType()==BoWType::BOW_TOKEN) { res.first=tok->getPosition(); res.second=tok->getPosition()+tok->getLength(); diff --git a/lima_linguisticprocessing/tools/tva/AnalysisTestCase.cpp b/lima_linguisticprocessing/tools/tva/AnalysisTestCase.cpp index 1c5d5ca25..1dbd35d0a 100644 --- a/lima_linguisticprocessing/tools/tva/AnalysisTestCase.cpp +++ b/lima_linguisticprocessing/tools/tva/AnalysisTestCase.cpp @@ -89,7 +89,7 @@ TestCaseError AnalysisTestCaseProcessor::processTestCase(const Lima::Common::TGV ofstream fout(outputfile.c_str(), std::ofstream::binary); fout << "" << endl; Common::BagOfWords::BoWXMLWriter writer(fout); - writer.writeBoWText(&text, true); + writer.writeBoWText(&text, true, false); 
fout.close(); TestCaseError error = evalTestCase( testCase, *pipItr, filename, filenameWithPipeLine ); diff --git a/lima_linguisticprocessing/tools/tva/tva.cpp b/lima_linguisticprocessing/tools/tva/tva.cpp index 444edcd5b..d9f30dc3e 100644 --- a/lima_linguisticprocessing/tools/tva/tva.cpp +++ b/lima_linguisticprocessing/tools/tva/tva.cpp @@ -32,6 +32,7 @@ #include "common/MediaticData/mediaticData.h" #include "common/XMLConfigurationFiles/xmlConfigurationFileParser.h" #include "common/Handler/AbstractAnalysisHandler.h" +#include "common/tools/FileUtils.h" #include "linguisticProcessing/client/AbstractLinguisticProcessingClient.h" #include "linguisticProcessing/client/LinguisticProcessingClientFactory.h" @@ -45,6 +46,7 @@ #include "common/AbstractFactoryPattern/AmosePluginsManager.h" // #endif +using namespace Lima::Common::Misc; using namespace Lima::Common::TGV; using namespace Lima::AnalysisValidation; using namespace Lima::LinguisticProcessing; @@ -82,12 +84,19 @@ int main(int argc, char **argv) int run(int argc,char** argv) { - QsLogging::initQsLog(); + QStringList configDirs = buildConfigurationDirectoriesList(QStringList() << "lima",QStringList()); + QString configPath = configDirs.join(LIMA_PATH_SEPARATOR); + + QStringList resourcesDirs = buildResourcesDirectoriesList(QStringList() << "lima",QStringList()); + QString resourcesPath = resourcesDirs.join(LIMA_PATH_SEPARATOR); + + QsLogging::initQsLog(configPath); // Necessary to initialize factories Lima::AmosePluginsManager::single(); + Lima::AmosePluginsManager::changeable().loadPlugins(configPath); - std::string resourcesPath=getenv("LIMA_RESOURCES")==0?"/usr/share/apps/lima/resources":std::string(getenv("LIMA_RESOURCES")); - std::string configDir=getenv("LIMA_CONF")==0?"/usr/share/config/lima":std::string(getenv("LIMA_CONF")); + std::string strConfigPath; + std::string strResourcesPath; std::string lpConfigFile=std::string("lima-lp-tva.xml"); std::string commonConfigFile=std::string("lima-common.xml"); 
std::string clientId=std::string("lima-coreclient"); @@ -112,9 +121,9 @@ int run(int argc,char** argv) else if ( (pos = arg.find("--common-config-file=")) != std::string::npos ) commonConfigFile = arg.substr(pos+21); else if ( (pos = arg.find("--config-dir=")) != std::string::npos ) - configDir = arg.substr(pos+13); + strConfigPath = arg.substr(pos+13); else if ( (pos = arg.find("--resources-dir=")) != std::string::npos ) - resourcesPath = arg.substr(pos+16); + strResourcesPath = arg.substr(pos+16); else if ( (pos = arg.find("--client=")) != std::string::npos ) clientId=arg.substr(pos+9); else if ( (pos = arg.find("--working-dir=")) != std::string::npos ) @@ -134,27 +143,49 @@ int run(int argc,char** argv) std::cerr << "No language specified. Aborting." << std::endl; return 1; } + if (!strResourcesPath.empty()) + { + resourcesPath = QString::fromUtf8(strResourcesPath.c_str()); + resourcesDirs = resourcesPath.split(LIMA_PATH_SEPARATOR); + } + if (!strConfigPath.empty()) + { + configPath = QString::fromUtf8(strConfigPath.c_str()); + configDirs = configPath.split(LIMA_PATH_SEPARATOR); + } setlocale(LC_ALL,"fr_FR.UTF-8"); - AbstractLinguisticProcessingClient* client(0); - // initialize common - MediaticData::changeable().init( - resourcesPath, - configDir, + Common::MediaticData::MediaticData::changeable().init( + resourcesPath.toUtf8().constData(), + configPath.toUtf8().constData(), commonConfigFile, langs); - - // initialize linguistic processing - Lima::Common::XMLConfigurationFiles::XMLConfigurationFileParser lpconfig(configDir + "/" + lpConfigFile); - LinguisticProcessingClientFactory::changeable().configureClientFactory( - clientId, - lpconfig, - langs, - pipelines); - - client=static_cast(LinguisticProcessingClientFactory::single().createClient(clientId)); + + bool clientFactoryConfigured = false; + Q_FOREACH(QString configDir, configDirs) + { + if (QFileInfo(configDir + "/" + lpConfigFile.c_str()).exists()) + { + // initialize linguistic processing + 
Lima::Common::XMLConfigurationFiles::XMLConfigurationFileParser lpconfig((configDir + "/" + lpConfigFile.c_str()).toStdString()); + LinguisticProcessingClientFactory::changeable().configureClientFactory( + clientId, + lpconfig, + langs, + pipelines); + clientFactoryConfigured = true; + break; + } + } + if(!clientFactoryConfigured) + { + std::cerr << "No LinguisticProcessingClientFactory were configured with" << configDirs.join(LIMA_PATH_SEPARATOR).toStdString() << "and" << lpConfigFile << std::endl; + return EXIT_FAILURE; + } + + std::shared_ptr< AbstractLinguisticProcessingClient > client = std::dynamic_pointer_cast(LinguisticProcessingClientFactory::single().createClient(clientId)); // Set the handlers std::map handlers; @@ -165,7 +196,7 @@ int run(int argc,char** argv) BowTextHandler* bowTextHandler = new BowTextHandler(); handlers.insert(std::make_pair("bowTextHandler", bowTextHandler)); - AnalysisTestCaseProcessor analysisTestCaseProcessor(workingDir, client, handlers); + AnalysisTestCaseProcessor analysisTestCaseProcessor(workingDir, client.get(), handlers); QXmlSimpleReader parser; TestCasesHandler tch(analysisTestCaseProcessor); @@ -234,7 +265,6 @@ int run(int argc,char** argv) std::cout << std::endl; tch.m_reportByType.clear(); } - delete client; delete bowTextWriter; delete simpleStreamHandler; delete bowTextHandler; diff --git a/lima_linguisticprocessing/tools/tvr/tvr.cpp b/lima_linguisticprocessing/tools/tvr/tvr.cpp index 0de47839b..25e31e2ae 100644 --- a/lima_linguisticprocessing/tools/tvr/tvr.cpp +++ b/lima_linguisticprocessing/tools/tvr/tvr.cpp @@ -145,8 +145,6 @@ int run(int argc,char** argv) setlocale(LC_ALL,"fr_FR.UTF-8"); - AbstractLinguisticProcessingClient* client(0); - // initialize common MediaticData::changeable().init( resourcesPath, @@ -161,11 +159,11 @@ int run(int argc,char** argv) lpconfig, MediaticData::single().getMedias()); - client=dynamic_cast(LinguisticProcessingClientFactory::single().createClient(clientId)); + std::shared_ptr 
client=std::dynamic_pointer_cast(LinguisticProcessingClientFactory::single().createClient(clientId)); ReaderTestCaseProcessor - readerTestCaseProcessor(workingDir, client); + readerTestCaseProcessor(workingDir, client.get()); QXmlSimpleReader parser; TestCasesHandler tch(readerTestCaseProcessor); diff --git a/lima_pelf/benchmarkingTool/BenchmarkingTool.cpp b/lima_pelf/benchmarkingTool/BenchmarkingTool.cpp index 1c6f41239..d1baca566 100644 --- a/lima_pelf/benchmarkingTool/BenchmarkingTool.cpp +++ b/lima_pelf/benchmarkingTool/BenchmarkingTool.cpp @@ -128,7 +128,7 @@ void BenchmarkingTool::init () pipeGraphsSplitter->restoreState(settings->value ("pipeGraphsSplitter").toByteArray()); QStringList textFiles = settings->value ("textFiles").toStringList(); - foreach (QString textFile, textFiles) + Q_FOREACH (QString textFile, textFiles) { recentFilesList->addItem(textFile); } @@ -162,11 +162,11 @@ void BenchmarkingTool::updateErrorsWidget() QMultiMap errors = utterancesWithErrors(selectedUnitTextPath); // qDebug() << "BenchmarkingTool::updateErrorsWidget utterancesWithErrors:" << errors.size(); - foreach (const QString& key, errors.keys().toSet()) + Q_FOREACH (const QString& key, errors.keys().toSet()) { QList list = errors.values(key); qSort(list); - foreach(const QString& value, list) + Q_FOREACH(const QString& value, list) { // qDebug() << "add child item" << key << value; QTreeWidgetItem * childItem = new QTreeWidgetItem(); @@ -204,7 +204,7 @@ void BenchmarkingTool::slotErrorStatementActivated(QTreeWidgetItem* item, int co BenchmarkingResult* benchResult = (pipeline->results)[pipeline->startTime]; QMap& puResult = benchResult->resultUnits; qDebug() << "BenchmarkingTool::slotErrorStatementActivated " << puResult.size() << " pu results"; - foreach(PipelineUnit* pipelineUnit, puResult.keys()) + Q_FOREACH(PipelineUnit* pipelineUnit, puResult.keys()) { // qDebug() << "selectedUnitTextPath: " << selectedUnitTextPath << "; pipelineName: " << pipelineName; if (pipelineName != 
pipelineUnit->name) continue; @@ -227,7 +227,7 @@ void BenchmarkingTool::slotErrorStatementActivated(QTreeWidgetItem* item, int co QList previousErrors = getErrors(pipelineName, statementId, m_previousBenchmarkingResult); qDebug() << "previousErrors" << pipelineName << statementId << ":" << previousErrors; - foreach (const QStringList& list, errors) + Q_FOREACH (const QStringList& list, errors) { if (!previousErrors.contains(list)) { @@ -238,7 +238,7 @@ void BenchmarkingTool::slotErrorStatementActivated(QTreeWidgetItem* item, int co } } - foreach (const QStringList& list, previousErrors) + Q_FOREACH (const QStringList& list, previousErrors) { if (!errors.contains(list)) { @@ -328,7 +328,7 @@ void BenchmarkingTool::resetEvaluationCurves () // recallQwtPlot->clear(); evaluationResultTypeQwtCurves.clear(); QMap& dimensions = EvaluationResult::getDimensions(); - foreach(EvaluationResult::DIMENSION_ID dimensionId, dimensions.keys()) + Q_FOREACH(EvaluationResult::DIMENSION_ID dimensionId, dimensions.keys()) // dimensionsIt = dimensions.begin(); dimensionsIt != dimensions.end(); dimensionsIt++) { EvaluationResultDimension* dimension = dimensions[dimensionId]; @@ -382,7 +382,7 @@ void BenchmarkingTool::updateDimensionsWidgets () QCheckBox* dimensionCheckBox; QLabel* dimensionLabel; QMap& dimensions = EvaluationResult::getDimensions(); - foreach(EvaluationResult::DIMENSION_ID dimensionId, dimensions.keys()) + Q_FOREACH(EvaluationResult::DIMENSION_ID dimensionId, dimensions.keys()) { EvaluationResultDimension* dimension = dimensions[dimensionId]; dimensionCheckBox = new QCheckBox(this); @@ -459,7 +459,7 @@ void BenchmarkingTool::pipelineUnitsChanged () pipelineUnitDisplayCb->addItem("All pipeline units"); const QList& pipelineUnits = pipeline->getUnits(); int unitId = 0; - foreach(PipelineUnit* unit, pipelineUnits) + Q_FOREACH(PipelineUnit* unit, pipelineUnits) { pipelineUnitDisplayCb->addItem(unit->name, unitId); unitId++; @@ -477,17 +477,17 @@ void 
BenchmarkingTool::updateResultsViews() if(pipelineUnitDisplayCb->currentIndex() > 0) selectedUnitTextPath = pipeline->getUnits()[pipelineUnitDisplayCb->currentIndex() - 1]->textPath; ResultsModel::selectedUnitTextPath = selectedUnitTextPath; - foreach(EvaluationResultDimension* dimension, dimensions) + Q_FOREACH(EvaluationResultDimension* dimension, dimensions) { EvaluationResult::DIMENSION_ID dimensionId = (EvaluationResult::DIMENSION_ID)dimension->id; int noResults = 1; double xFmeasure[nbRes], yFmeasure[nbRes], xPrecision[nbRes], yPrecision[nbRes], xRecall[nbRes], yRecall[nbRes]; - foreach (BenchmarkingResult* result, pipeline->results) + Q_FOREACH (BenchmarkingResult* result, pipeline->results) { QMap& puResult = result->resultUnits; int nbPus = 0; double sumFc = 0, sumFp = 0, sumCr = 0; - foreach (PipelineUnit* pipelineUnit, puResult.keys()) + Q_FOREACH (PipelineUnit* pipelineUnit, puResult.keys()) { if(selectedUnitTextPath.isEmpty() || selectedUnitTextPath == pipelineUnit->textPath) { @@ -927,7 +927,7 @@ void BenchmarkingTool::slotTextFileActivated(QListWidgetItem* item) void BenchmarkingTool::slotRemoveTextFile() { QList items = recentFilesList->selectedItems(); - foreach(QListWidgetItem* item, items) + Q_FOREACH(QListWidgetItem* item, items) { recentFilesList->takeItem(recentFilesList->row(item)); } @@ -985,7 +985,7 @@ void BenchmarkingTool::compareWith(const QString& otherFilename) qDebug() << "BenchmarkingTool::compareWith"; QMultiMap utterancesSet = utterancesWithErrors(selectedUnitTextPath); QString utterances; - foreach (const QString& utt, utterancesSet.values(selectedUnitTextPath)) + Q_FOREACH (const QString& utt, utterancesSet.values(selectedUnitTextPath)) { utterances += utt + ","; } @@ -1043,7 +1043,7 @@ QMultiMap BenchmarkingTool::utterancesWithErrors(BenchmarkingRe // words if it is not the last one QMap puResult = benchmarkingResult->resultUnits; // qDebug() << "BenchmarkingTool::utterancesWithErrors " << puResult.size() << " pu results"; - 
foreach(PipelineUnit* pipelineUnit, puResult.keys()) + Q_FOREACH(PipelineUnit* pipelineUnit, puResult.keys()) { QString pipelineName = pipelineUnit->name; // qDebug() << "selectedUnitTextPath: " << selectedUnitTextPath << "; pipelineName: " << pipelineName; @@ -1065,7 +1065,7 @@ QMultiMap BenchmarkingTool::utterancesWithErrors(BenchmarkingRe keys.unite(QSet::fromList(fals.keys())); keys.unite(QSet::fromList(type.keys())); // qDebug() << "utterances insert" << pipelineName << keys; - foreach(const QString& key, keys) + Q_FOREACH(const QString& key, keys) { if (!utterances.values(pipelineName).contains(key)) utterances.insert(pipelineName,key); @@ -1123,10 +1123,10 @@ void BenchmarkingTool::updateErrorsWidget(BenchmarkingResult* benchmarkingResult // qDebug() << "BenchmarkingTool::updateErrorsWidget (compare) AFTER UTTERANCES WITH ERRORS"; QMap alreadyInserted; - foreach (const QString& key, errorUtterances.keys().toSet()) + Q_FOREACH (const QString& key, errorUtterances.keys().toSet()) { // qDebug() << "key" << key; - foreach(const QString& value, errorUtterances.values(key)) + Q_FOREACH(const QString& value, errorUtterances.values(key)) { // qDebug() << "value" << value; QList errors = getErrors(key, value, benchmarkingResult); @@ -1134,7 +1134,7 @@ void BenchmarkingTool::updateErrorsWidget(BenchmarkingResult* benchmarkingResult QList previousErrors = getErrors(key, value, previousBenchmarkingResult); qDebug() << "previousErrors" << key << value << ":" << previousErrors; - foreach (const QStringList& list, errors) + Q_FOREACH (const QStringList& list, errors) { if (!previousErrors.contains(list) && !(alreadyInserted.contains(key) && alreadyInserted.values(key).contains(value))) { @@ -1147,7 +1147,7 @@ void BenchmarkingTool::updateErrorsWidget(BenchmarkingResult* benchmarkingResult alreadyInserted[key] = value; } } - foreach (const QStringList& list, previousErrors) + Q_FOREACH (const QStringList& list, previousErrors) { if (!errors.contains(list) && 
!(alreadyInserted.contains(key) && alreadyInserted.values(key).contains(value))) { @@ -1173,7 +1173,7 @@ QList BenchmarkingTool::getErrors(const QString& unit, const QStrin QMap& puResult = benchResult->resultUnits; qDebug() << "BenchmarkingTool::getErrors" << puResult.size() << " pu results"; - foreach(PipelineUnit* pipelineUnit, puResult.keys()) + Q_FOREACH(PipelineUnit* pipelineUnit, puResult.keys()) { QString pipelineName = pipelineUnit->name; qDebug() << "unit: " << unit << "; pipelineName: " << pipelineName; diff --git a/lima_pelf/benchmarkingTool/BenchmarkingXmlWriter.cpp b/lima_pelf/benchmarkingTool/BenchmarkingXmlWriter.cpp index d0f2aaa75..10020bf08 100644 --- a/lima_pelf/benchmarkingTool/BenchmarkingXmlWriter.cpp +++ b/lima_pelf/benchmarkingTool/BenchmarkingXmlWriter.cpp @@ -45,7 +45,7 @@ bool BenchmarkingXmlWriter::write() writeStartElement("pipeline"); int unitId = 0; const QList& pipelineUnits = m_pipeline->getUnits(); - foreach (PipelineUnit* unit, pipelineUnits) + Q_FOREACH (PipelineUnit* unit, pipelineUnits) { writeStartElement("unit"); unit->fileId = ++unitId; @@ -59,20 +59,20 @@ bool BenchmarkingXmlWriter::write() writeEndElement(); writeStartElement("evaluations"); - foreach (BenchmarkingResult* benchmarkingResult, m_pipeline->results) + Q_FOREACH (BenchmarkingResult* benchmarkingResult, m_pipeline->results) { writeStartElement("pipelineEvaluation"); QDateTime evaluationTime = benchmarkingResult->time; writeAttribute("time", QString::number(evaluationTime.toTime_t())); writeAttribute("comments", benchmarkingResult->comment); - foreach (PipelineUnit* unit, pipelineUnits) + Q_FOREACH (PipelineUnit* unit, pipelineUnits) { if(benchmarkingResult->resultUnits.contains(unit)) { writeStartElement("unitEvaluation"); writeAttribute("unitId", QString::number(unit->fileId)); EvaluationResultSet* unitResults = benchmarkingResult->resultUnits[unit]; - foreach(EvaluationResult::DIMENSION_ID dimensionId, unitResults->keys()) + 
Q_FOREACH(EvaluationResult::DIMENSION_ID dimensionId, unitResults->keys()) { EvaluationResult* result = (*unitResults)[dimensionId]; writeStartElement("result"); diff --git a/lima_pelf/benchmarkingTool/CommentEditDlg.h b/lima_pelf/benchmarkingTool/CommentEditDlg.h index 31d6ea3dc..8148992f2 100644 --- a/lima_pelf/benchmarkingTool/CommentEditDlg.h +++ b/lima_pelf/benchmarkingTool/CommentEditDlg.h @@ -40,7 +40,7 @@ Q_OBJECT CommentEditDlg (QWidget* parent = 0); void init (BenchmarkingResult* br, Pipeline* p = 0); -public slots: +public Q_SLOTS: void submit (); diff --git a/lima_pelf/benchmarkingTool/EvaluationResult.cpp b/lima_pelf/benchmarkingTool/EvaluationResult.cpp index ab445ae91..63d11d00e 100644 --- a/lima_pelf/benchmarkingTool/EvaluationResult.cpp +++ b/lima_pelf/benchmarkingTool/EvaluationResult.cpp @@ -195,12 +195,12 @@ QList EvaluationResult::getDimensionsVisible () { QList dimensionsVisible; bool masked = false; - foreach(EvaluationResultDimension* dimension, dimensions) + Q_FOREACH(EvaluationResultDimension* dimension, dimensions) { if(dimension->visibilityState == Qt::PartiallyChecked) masked = true; } - foreach(EvaluationResultDimension* dimension, dimensions) + Q_FOREACH(EvaluationResultDimension* dimension, dimensions) { int dimensionGroupId = getDimensionGroup((DIMENSION_ID)dimension->id); bool groupChecked = dimensionGroupId == -1 || dimensions[(DIMENSION_ID)dimensionGroupId]->visibilityState == Qt::Checked; diff --git a/lima_pelf/benchmarkingTool/EvaluationResultDimension.cpp b/lima_pelf/benchmarkingTool/EvaluationResultDimension.cpp index 0624d20b0..1808d9e66 100644 --- a/lima_pelf/benchmarkingTool/EvaluationResultDimension.cpp +++ b/lima_pelf/benchmarkingTool/EvaluationResultDimension.cpp @@ -44,7 +44,7 @@ EvaluationResultDimension::~EvaluationResultDimension() void EvaluationResultDimension::updateVisibleChanged (int state) { visibilityState = (Qt::CheckState)state; - emit visibleChanged(); + Q_EMIT visibleChanged(); } #include 
"EvaluationResultDimension.moc" diff --git a/lima_pelf/benchmarkingTool/EvaluationResultDimension.h b/lima_pelf/benchmarkingTool/EvaluationResultDimension.h index ebb7341ae..1d0e42486 100644 --- a/lima_pelf/benchmarkingTool/EvaluationResultDimension.h +++ b/lima_pelf/benchmarkingTool/EvaluationResultDimension.h @@ -47,10 +47,10 @@ Q_OBJECT EvaluationResultDimension (QString n, QString k, QColor c, int i, bool r); -signals: +Q_SIGNALS: void visibleChanged (); -public slots: +public Q_SLOTS: void updateVisibleChanged (int state); diff --git a/lima_pelf/benchmarkingTool/Pipeline.cpp b/lima_pelf/benchmarkingTool/Pipeline.cpp index c8eb88528..a3b7e040f 100644 --- a/lima_pelf/benchmarkingTool/Pipeline.cpp +++ b/lima_pelf/benchmarkingTool/Pipeline.cpp @@ -121,7 +121,7 @@ void Pipeline::moveUnits (QModelIndexList sourceIndexes, QModelIndex targetIndex return; qDebug() << "Reordering pipeline files"; QList::const_iterator sourceIndexIt; - emit layoutAboutToBeChanged(); + Q_EMIT layoutAboutToBeChanged(); qSort(sourceIndexes); QList movedUnits; int shiftSourceIndexes = 0, shiftTargetIndex = 0, targetIndexRow = targetIndex.row(); @@ -133,12 +133,12 @@ void Pipeline::moveUnits (QModelIndexList sourceIndexes, QModelIndex targetIndex if(sourceIndexRow < targetIndexRow) shiftTargetIndex = shiftSourceIndexes; } - foreach(PipelineUnit* movedUnit, movedUnits) + Q_FOREACH(PipelineUnit* movedUnit, movedUnits) { units.insert(units.begin() + targetIndex.row() + shiftTargetIndex, movedUnit); shiftTargetIndex++; } - emit layoutChanged(); + Q_EMIT layoutChanged(); pipelineView->clearSelection(); unitsUpdate(); setDirty(); @@ -162,12 +162,12 @@ void Pipeline::deleteUnits (QModelIndexList unitIndexes) { qDebug() << "Removing pipeline files"; QList::const_iterator unitIndexesIt; - emit layoutAboutToBeChanged(); + Q_EMIT layoutAboutToBeChanged(); qSort(unitIndexes); int shiftIndexes = 0; for(unitIndexesIt = unitIndexes.constBegin(); unitIndexesIt != unitIndexes.constEnd(); unitIndexesIt++) 
shiftIndexes = deleteUnit(*unitIndexesIt, shiftIndexes); - emit layoutChanged(); + Q_EMIT layoutChanged(); setDirty(); pipelineView->clearSelection(); } @@ -190,12 +190,12 @@ int Pipeline::deleteUnit (QModelIndex unitIndex, int shiftIndexes) void Pipeline::clearUnits () { - emit layoutAboutToBeChanged(); + Q_EMIT layoutAboutToBeChanged(); QList::iterator unitsIt = units.begin(); for(; unitsIt < units.end(); unitsIt++) delete (*unitsIt); units.clear(); - emit layoutChanged(); + Q_EMIT layoutChanged(); pipelineView->clearSelection(); unitsUpdate(); setDirty(); @@ -225,7 +225,7 @@ bool Pipeline::resetBenchmarking () for(; unitsIt < units.end(); unitsIt++) (*unitsIt)->status = PipelineUnit::STATUS_UNPROCESSED; pipelineView->reset(); - emit resultsChanged(); + Q_EMIT resultsChanged(); qDebug() << "Reseted pipeline files processing states"; processing = false; return true; @@ -318,14 +318,14 @@ void Pipeline::continueBenchmarking () for(; unitsIt != units.end(); unitsIt++) if((*unitsIt)->status == PipelineUnit::STATUS_PROCESSED) (*unitsIt)->status = PipelineUnit::STATUS_UNPROCESSED; - emit finishedBenchmarking(); + Q_EMIT finishedBenchmarking(); } } void Pipeline::unitsUpdate () { QMap::iterator resultsIt; - foreach (BenchmarkingResult* result, results) + Q_FOREACH (BenchmarkingResult* result, results) { QMap newResultUnits; QList::iterator unitsIt = units.begin(); @@ -334,7 +334,7 @@ void Pipeline::unitsUpdate () newResultUnits[*unitsIt] = result->resultUnits[*unitsIt]; result->resultUnits = newResultUnits; } - emit unitsChanged(); + Q_EMIT unitsChanged(); } void Pipeline::unitResultsChanged (PipelineUnit* pu, EvaluationResultSet* ers) @@ -354,7 +354,7 @@ void Pipeline::unitResultsChanged (PipelineUnit* pu, EvaluationResultSet* ers) qDebug() << "Pipeline::unitResultsChanged startTime"<reset(); - emit resultsChanged(); + Q_EMIT resultsChanged(); if(processing) continueBenchmarking(); } diff --git a/lima_pelf/benchmarkingTool/Pipeline.h 
b/lima_pelf/benchmarkingTool/Pipeline.h index 427d2c0df..e313302cd 100644 --- a/lima_pelf/benchmarkingTool/Pipeline.h +++ b/lima_pelf/benchmarkingTool/Pipeline.h @@ -81,11 +81,11 @@ Q_OBJECT Qt::DropActions supportedDropActions() const; Qt::ItemFlags flags(const QModelIndex &index) const; -public slots: +public Q_SLOTS: void unitResultsChanged (PipelineUnit* pu, EvaluationResultSet* ers); -signals: +Q_SIGNALS: void unitsChanged (); void resultsChanged (); diff --git a/lima_pelf/benchmarkingTool/PipelineConfigureDlg.h b/lima_pelf/benchmarkingTool/PipelineConfigureDlg.h index 8f32b6187..6f1af5c2a 100644 --- a/lima_pelf/benchmarkingTool/PipelineConfigureDlg.h +++ b/lima_pelf/benchmarkingTool/PipelineConfigureDlg.h @@ -43,7 +43,7 @@ Q_OBJECT PipelineConfigureDlg (QWidget* parent = 0); void init (Pipeline* p, QString workingDir, QString analyzerCmd, QString evaluatorCmd, int cp); -public slots: +public Q_SLOTS: void workingDirBrowse (); void analyzerCmdBrowse (); diff --git a/lima_pelf/benchmarkingTool/PipelineEditFileDlg.h b/lima_pelf/benchmarkingTool/PipelineEditFileDlg.h index e0de3ab28..ccb6ba46d 100644 --- a/lima_pelf/benchmarkingTool/PipelineEditFileDlg.h +++ b/lima_pelf/benchmarkingTool/PipelineEditFileDlg.h @@ -44,7 +44,7 @@ Q_OBJECT void init (Pipeline* bp); void setUnit (PipelineUnit* unit, int index); -public slots: +public Q_SLOTS: void textFileBrowse (); void referenceFileBrowse (); diff --git a/lima_pelf/benchmarkingTool/PipelineUnit.cpp b/lima_pelf/benchmarkingTool/PipelineUnit.cpp index 2962b3e71..458ca62a1 100644 --- a/lima_pelf/benchmarkingTool/PipelineUnit.cpp +++ b/lima_pelf/benchmarkingTool/PipelineUnit.cpp @@ -108,7 +108,7 @@ void PipelineUnit::commandFinished (int exitCode, QProcess::ExitStatus exitStatu resultSet->findEvaluationResults(output); // qDebug() << "Finished benchmarking file " << name << ", evaluation results found, processing results output"; status = STATUS_PROCESSED; - emit unitResultsChanged(this, resultSet); + Q_EMIT 
unitResultsChanged(this, resultSet); } } diff --git a/lima_pelf/benchmarkingTool/PipelineWidget.h b/lima_pelf/benchmarkingTool/PipelineWidget.h index 140352b56..29176463f 100644 --- a/lima_pelf/benchmarkingTool/PipelineWidget.h +++ b/lima_pelf/benchmarkingTool/PipelineWidget.h @@ -48,7 +48,7 @@ Q_OBJECT void keyPressEvent (QKeyEvent* event); void contextMenuEvent (QContextMenuEvent* event); -public slots: +public Q_SLOTS: void editUnit (const QModelIndex& index); void contextEdit (); diff --git a/lima_pelf/benchmarkingTool/ResultsWidget.cpp b/lima_pelf/benchmarkingTool/ResultsWidget.cpp index 938668af7..c80d87d0c 100644 --- a/lima_pelf/benchmarkingTool/ResultsWidget.cpp +++ b/lima_pelf/benchmarkingTool/ResultsWidget.cpp @@ -114,7 +114,7 @@ void ResultsWidget::contextDelete () ) == QMessageBox::Ok) { pipeline->results.remove(benchmarkingResultTime); - emit resultsChanged(); + Q_EMIT resultsChanged(); } } @@ -122,7 +122,7 @@ void ResultsWidget::contextView () { if(selectedIndexes().size() <= 0) return; - emit viewResult(selectedIndexes().first().row()); + Q_EMIT viewResult(selectedIndexes().first().row()); } diff --git a/lima_pelf/resourceTool/DictionnaryEntryEditDlg.cpp b/lima_pelf/resourceTool/DictionnaryEntryEditDlg.cpp index 03f166bfe..b82ebcabd 100644 --- a/lima_pelf/resourceTool/DictionnaryEntryEditDlg.cpp +++ b/lima_pelf/resourceTool/DictionnaryEntryEditDlg.cpp @@ -74,7 +74,7 @@ void DictionnaryEntryEditDlg::submit () dictionnaryEntry->normalization = normalizationInp->text(); dictionnaryEntry->category = categoryCb->itemText(categoryCb->currentIndex()); dictionnaryEntry->displayable = true; - emit updateEntry(dictionnaryEntry); + Q_EMIT updateEntry(dictionnaryEntry); } #include "DictionnaryEntryEditDlg.moc" diff --git a/lima_pelf/resourceTool/DictionnaryEntryEditDlg.h b/lima_pelf/resourceTool/DictionnaryEntryEditDlg.h index 6094d75f8..f6ab07fb3 100644 --- a/lima_pelf/resourceTool/DictionnaryEntryEditDlg.h +++ b/lima_pelf/resourceTool/DictionnaryEntryEditDlg.h 
@@ -41,10 +41,10 @@ Q_OBJECT DictionnaryEntryEditDlg (QWidget* parent = 0); void init (ResourceEditorTableModel* rem, AbstractResourceEntry* are = 0); -signals: +Q_SIGNALS: void updateEntry (AbstractResourceEntry* de); -public slots: +public Q_SLOTS: void submit (); void checkValidity(); diff --git a/lima_pelf/resourceTool/IdiomaticExpressionEntryEditDlg.cpp b/lima_pelf/resourceTool/IdiomaticExpressionEntryEditDlg.cpp index fd7984f80..3c305f51b 100644 --- a/lima_pelf/resourceTool/IdiomaticExpressionEntryEditDlg.cpp +++ b/lima_pelf/resourceTool/IdiomaticExpressionEntryEditDlg.cpp @@ -87,7 +87,7 @@ void IdiomaticExpressionEntryEditDlg::submit () idiomaticExpressionEntry->lemma = lemmaInp->text(); idiomaticExpressionEntry->contextual = contextualCb->itemText(contextualCb->currentIndex()); idiomaticExpressionEntry->displayable = true; - emit updateEntry(idiomaticExpressionEntry); + Q_EMIT updateEntry(idiomaticExpressionEntry); } #include "IdiomaticExpressionEntryEditDlg.moc" diff --git a/lima_pelf/resourceTool/IdiomaticExpressionEntryEditDlg.h b/lima_pelf/resourceTool/IdiomaticExpressionEntryEditDlg.h index 628630c4c..646756487 100644 --- a/lima_pelf/resourceTool/IdiomaticExpressionEntryEditDlg.h +++ b/lima_pelf/resourceTool/IdiomaticExpressionEntryEditDlg.h @@ -41,10 +41,10 @@ Q_OBJECT IdiomaticExpressionEntryEditDlg (QWidget* parent = 0); void init (ResourceEditorTableModel* rem, AbstractResourceEntry* are = 0); -signals: +Q_SIGNALS: void updateEntry (AbstractResourceEntry* de); -public slots: +public Q_SLOTS: void submit (); void checkValidity(); diff --git a/lima_pelf/resourceTool/ResourceEditorTableModel.cpp b/lima_pelf/resourceTool/ResourceEditorTableModel.cpp index 17e54d515..eea78c1fc 100644 --- a/lima_pelf/resourceTool/ResourceEditorTableModel.cpp +++ b/lima_pelf/resourceTool/ResourceEditorTableModel.cpp @@ -64,7 +64,7 @@ void ResourceEditorTableModel::addEntry (AbstractResourceEntry* de) { dataModified = true; availableData << de; - emit dataChanged(); + 
Q_EMIT dataChanged(); } void ResourceEditorTableModel::deleteEntries (QModelIndexList indexList) @@ -77,7 +77,7 @@ void ResourceEditorTableModel::deleteEntries (QModelIndexList indexList) entriesList << entry; for(int i = 0; i < entriesList.size(); i++) availableData.removeAll(entriesList.at(i)); - emit dataChanged(); + Q_EMIT dataChanged(); } #include "ResourceEditorTableModel.moc" diff --git a/lima_pelf/resourceTool/ResourceEditorTableModel.h b/lima_pelf/resourceTool/ResourceEditorTableModel.h index 6336b31f3..ca4c6232b 100644 --- a/lima_pelf/resourceTool/ResourceEditorTableModel.h +++ b/lima_pelf/resourceTool/ResourceEditorTableModel.h @@ -42,7 +42,7 @@ Q_OBJECT void addEntry (); void deleteEntries (QModelIndexList indexList); -public slots: +public Q_SLOTS: void addEntry (AbstractResourceEntry* de); diff --git a/lima_pelf/resourceTool/ResourceEditorTableWidget.cpp b/lima_pelf/resourceTool/ResourceEditorTableWidget.cpp index b8800e42f..8d1dd5db6 100644 --- a/lima_pelf/resourceTool/ResourceEditorTableWidget.cpp +++ b/lima_pelf/resourceTool/ResourceEditorTableWidget.cpp @@ -45,12 +45,12 @@ void ResourceEditorTableWidget::editEntry (const QModelIndex& index) if(entry == 0) return; retm->dataModified = true; - emit editEntryDlg(entry); + Q_EMIT editEntryDlg(entry); } void ResourceEditorTableWidget::createEntry () { - emit editEntryDlg(); + Q_EMIT editEntryDlg(); } void ResourceEditorTableWidget::keyPressEvent (QKeyEvent* event) diff --git a/lima_pelf/resourceTool/ResourceEditorTableWidget.h b/lima_pelf/resourceTool/ResourceEditorTableWidget.h index 9baaedc76..0865d64b9 100644 --- a/lima_pelf/resourceTool/ResourceEditorTableWidget.h +++ b/lima_pelf/resourceTool/ResourceEditorTableWidget.h @@ -43,14 +43,14 @@ Q_OBJECT void keyPressEvent (QKeyEvent* event); void contextMenuEvent (QContextMenuEvent* event); -public slots: +public Q_SLOTS: void editEntry (const QModelIndex& index); void createEntry (); void contextEdit (); void contextDelete (); -signals: +Q_SIGNALS: 
void editEntryDlg (AbstractResourceEntry* are = 0); diff --git a/lima_pelf/resourceTool/ResourceReaderSimpleModel.h b/lima_pelf/resourceTool/ResourceReaderSimpleModel.h index 7fbeec6a5..1292889e8 100644 --- a/lima_pelf/resourceTool/ResourceReaderSimpleModel.h +++ b/lima_pelf/resourceTool/ResourceReaderSimpleModel.h @@ -81,7 +81,7 @@ class ResourceReaderSimpleModel void install (); virtual void installFinished (int exitCode, QProcess::ExitStatus exitStatus); virtual void installError (QProcess::ProcessError error); - virtual void emitDataInstalled (bool success) = 0; // Virtual emit function (needed for class to be polymorphic) + virtual void emitDataInstalled (bool success) = 0; // Virtual Q_EMIT function (needed for class to be polymorphic) protected: QString installComand; diff --git a/lima_pelf/resourceTool/ResourceReaderTableModel.cpp b/lima_pelf/resourceTool/ResourceReaderTableModel.cpp index b95dfcf22..51e2fbf00 100644 --- a/lima_pelf/resourceTool/ResourceReaderTableModel.cpp +++ b/lima_pelf/resourceTool/ResourceReaderTableModel.cpp @@ -72,7 +72,7 @@ void ResourceReaderTableModel::sortByHeader (int column, Qt::SortOrder order) sortedHeaderColumn = column; sortedHeaderOrder = order; qSort(availableData.begin(), availableData.end(), headerLessThan); - emit dataChanged(); + Q_EMIT dataChanged(); } bool ResourceReaderTableModel::headerLessThan (AbstractResourceEntry* entry1, AbstractResourceEntry* entry2) diff --git a/lima_pelf/resourceTool/ResourceReaderTableModel.h b/lima_pelf/resourceTool/ResourceReaderTableModel.h index 427e1d305..05a4950bb 100644 --- a/lima_pelf/resourceTool/ResourceReaderTableModel.h +++ b/lima_pelf/resourceTool/ResourceReaderTableModel.h @@ -52,7 +52,7 @@ Q_OBJECT void sortByHeader (int column, Qt::SortOrder order); static bool headerLessThan (AbstractResourceEntry* entry1, AbstractResourceEntry* entry2); QVariant data (const QModelIndex& index, int role) const; - void emitDataInstalled (bool success) { emit dataInstalled(success); }; // 
ResourceReaderSimpleModel needed emit function + void emitDataInstalled (bool success) { Q_EMIT dataInstalled(success); }; // ResourceReaderSimpleModel needed Q_EMIT function Q_SIGNALS: diff --git a/lima_pelf/resourceTool/ResourceReaderToolBoxModel.h b/lima_pelf/resourceTool/ResourceReaderToolBoxModel.h index 7186f0771..0fe834881 100644 --- a/lima_pelf/resourceTool/ResourceReaderToolBoxModel.h +++ b/lima_pelf/resourceTool/ResourceReaderToolBoxModel.h @@ -39,7 +39,7 @@ Q_OBJECT ResourceReaderToolBoxModel (QObject * parent = 0); virtual ~ResourceReaderToolBoxModel(); - void emitDataInstalled (bool success) { emit dataInstalled(success); }; // ResourceReaderSimpleModel needed emit function + void emitDataInstalled (bool success) { Q_EMIT dataInstalled(success); }; // ResourceReaderSimpleModel needed Q_EMIT function Q_SIGNALS: void dataInstalled (bool success); // ResourceReaderSimpleModel needed signal diff --git a/lima_pelf/resourceTool/ResourceTool.h b/lima_pelf/resourceTool/ResourceTool.h index 507beb6ef..17bbd15f2 100644 --- a/lima_pelf/resourceTool/ResourceTool.h +++ b/lima_pelf/resourceTool/ResourceTool.h @@ -63,7 +63,7 @@ Q_OBJECT bool popPelfSharedMemory (QString msg); void logDebugMsg (QtMsgType type, const char* msg); -public slots: +public Q_SLOTS: void dictionnaryEditDlg (AbstractResourceEntry* are = 0); void dictionnarySearch (); diff --git a/manageQt5.cmake b/manageQt5.cmake index 45166305d..8c0a883c7 100644 --- a/manageQt5.cmake +++ b/manageQt5.cmake @@ -22,6 +22,7 @@ set(CMAKE_PREFIX_PATH ) # Add definitions and flags +add_definitions(-DQT_NO_KEYWORDS) add_definitions(-DQT_DISABLE_DEPRECATED_BEFORE=0) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -lpthread")