diff --git a/DESCRIPTION b/DESCRIPTION index 8997757..e20eadd 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: rEDM Type: Package Title: Empirical Dynamic Modeling ('EDM') -Version: 1.15.1 -Date: 2023-10-27 +Version: 1.15.3 +Date: 2023-12-01 Authors@R: c( person("Joseph", "Park", role = c("aut", "cre"), email = "JosephPark@IEEE.org", comment = c(ORCID = "0000-0001-5411-1409")), diff --git a/NEWS.md b/NEWS.md index d8cf981..1e0e056 100644 --- a/NEWS.md +++ b/NEWS.md @@ -19,6 +19,8 @@ - `CCM()` `replacement` parameter removed. - Legacy overload functions removed. - Version 1.15.1 `ignoreNan` added in `PredictNonlinear()`. Replace unicode in pLot labels with mathplot expression. cppEDM initialize `nanFound` in DataFrame.h for UBSAN. Sync with cppEDM 1.15.1. +- Version 1.15.2 Allow `columns` names with spaces. If the `columns` argument is a string use the "," delimiter to separate names. Remove `SMap` warning for disjoint library. +- Version 1.15.3 Allow `columns` and `target` names with spaces in CCM. ##### Version 1.14 - cppEDM core added `generateLibrary` parameter to `Simplex()` and `SMap()`. If `TRUE` the state-space library has newly generated points added. Not available due to Rcpp 20 parameter limit. diff --git a/R/EDM.R b/R/EDM.R index 0997fc1..f937f6b 100644 --- a/R/EDM.R +++ b/R/EDM.R @@ -41,8 +41,9 @@ Embed = function( path = "./", } # If columns are vectors/list, convert to string for cppEDM - if ( is.vector( columns ) || is.list( columns ) ) { - columns = paste( columns, collapse = " " ) + # NOTE: columns joined on ',' to enable names with whitespace in cppEDM + if ( ! is.character( columns ) || length( columns ) > 1 ) { + columns = FlattenToString( columns, "," ) } # Mapped to Embed_rcpp() (Embed.cpp) in RcppEDMCommon.cpp @@ -102,6 +103,7 @@ Simplex = function( pathIn = "./", } # If lib, pred, columns are vectors/list, convert to string for cppEDM + # NOTE: columns joined on ',' to enable names with whitespace in cppEDM if ( ! 
is.character( lib ) || length( lib ) > 1 ) { lib = FlattenToString( lib ) } @@ -109,7 +111,10 @@ Simplex = function( pathIn = "./", pred = FlattenToString( pred ) } if ( ! is.character( columns ) || length( columns ) > 1 ) { - columns = FlattenToString( columns ) + columns = FlattenToString( columns, "," ) + } + if ( length( strsplit( target, ' ' )[[1]] ) > 1 ) { + target = paste0( target, ',' ) # space in target: add , for cppEDM } # NOTE: Rcpp has a 20 argument limit! @@ -196,6 +201,7 @@ SMap = function( pathIn = "./", } # If lib, pred, columns are vectors/list, convert to string for cppEDM + # NOTE: columns joined on ',' to enable names with whitespace in cppEDM if ( ! is.character( lib ) || length( lib ) > 1 ) { lib = FlattenToString( lib ) } @@ -203,7 +209,10 @@ SMap = function( pathIn = "./", pred = FlattenToString( pred ) } if ( ! is.character( columns ) || length( columns ) > 1 ) { - columns = FlattenToString( columns ) + columns = FlattenToString( columns, "," ) + } + if ( length( strsplit( target, ' ' )[[1]] ) > 1 ) { + target = paste0( target, ',' ) # space in target: add , for cppEDM } # NOTE: Rcpp has a 20 argument limit! @@ -287,6 +296,7 @@ Multiview = function( pathIn = "./", } # If lib, pred, columns are vectors/list, convert to string for cppEDM + # NOTE: columns joined on ',' to enable names with whitespace in cppEDM if ( ! is.character( lib ) || length( lib ) > 1 ) { lib = FlattenToString( lib ) } @@ -294,7 +304,10 @@ Multiview = function( pathIn = "./", pred = FlattenToString( pred ) } if ( ! is.character( columns ) || length( columns ) > 1 ) { - columns = FlattenToString( columns ) + columns = FlattenToString( columns, "," ) + } + if ( length( strsplit( target, ' ' )[[1]] ) > 1 ) { + target = paste0( target, ',' ) # space in target: add , for cppEDM } # NOTE: Rcpp has a 20 argument limit! 
@@ -390,11 +403,21 @@ CCM = function( pathIn = "./", } # If libSizes, columns are vectors/list, convert to string for cppEDM + # NOTE: columns joined on ',' to enable names with whitespace in cppEDM + # NOTE: CCM can have multiple targets if ( ! is.character( libSizes ) || length( libSizes ) > 1 ) { libSizes = FlattenToString( libSizes ) } if ( ! is.character( columns ) || length( columns ) > 1 ) { - columns = FlattenToString( columns ) + columns = FlattenToString( columns, "," ) + } + if ( ! is.character( target ) || length( target ) > 1 ) { + target = FlattenToString( target, "," ) + } + else { + if ( length( strsplit( target, ' ' )[[1]] ) > 1 ) { + target = paste0( target, ',' ) # space in target: add , for cppEDM + } } # NOTE: Rcpp has a 20 argument limit! @@ -493,6 +516,7 @@ EmbedDimension = function ( pathIn = "./", } # If lib, pred, columns are vectors/list, convert to string for cppEDM + # NOTE: columns joined on ',' to enable names with whitespace in cppEDM if ( ! is.character( lib ) || length( lib ) > 1 ) { lib = FlattenToString( lib ) } @@ -500,7 +524,10 @@ EmbedDimension = function ( pathIn = "./", pred = FlattenToString( pred ) } if ( ! is.character( columns ) || length( columns ) > 1 ) { - columns = FlattenToString( columns ) + columns = FlattenToString( columns, "," ) + } + if ( length( strsplit( target, ' ' )[[1]] ) > 1 ) { + target = paste0( target, ',' ) # space in target: add , for cppEDM } # Mapped to EmbedDimension_rcpp() (EmbedDim.cpp) in RcppEDMCommon.cpp @@ -574,6 +601,7 @@ PredictInterval = function( pathIn = "./", } # If lib, pred, columns are vectors/list, convert to string for cppEDM + # NOTE: columns joined on ',' to enable names with whitespace in cppEDM if ( ! is.character( lib ) || length( lib ) > 1 ) { lib = FlattenToString( lib ) } @@ -581,7 +609,10 @@ PredictInterval = function( pathIn = "./", pred = FlattenToString( pred ) } if ( ! 
is.character( columns ) || length( columns ) > 1 ) { - columns = FlattenToString( columns ) + columns = FlattenToString( columns, "," ) + } + if ( length( strsplit( target, ' ' )[[1]] ) > 1 ) { + target = paste0( target, ',' ) # space in target: add , for cppEDM } # Mapped to PredictInterval_rcpp() (PredictInterval.cpp) in RcppEDMCommon.cpp @@ -658,6 +689,7 @@ PredictNonlinear = function( pathIn = "./", } # If lib, pred, theta, columns are vectors/list, convert to string for cppEDM + # NOTE: columns joined on ',' to enable names with whitespace in cppEDM if ( ! is.character( lib ) || length( lib ) > 1 ) { lib = FlattenToString( lib ) } @@ -668,7 +700,10 @@ PredictNonlinear = function( pathIn = "./", theta = FlattenToString( theta ) } if ( ! is.character( columns ) || length( columns ) > 1 ) { - columns = FlattenToString( columns ) + columns = FlattenToString( columns, "," ) + } + if ( length( strsplit( target, ' ' )[[1]] ) > 1 ) { + target = paste0( target, ',' ) # space in target: add , for cppEDM } # Mapped to PredictNonlinear_rcpp() (PredictNL.cpp) in RcppEDMCommon.cpp diff --git a/R/EDM_AuxFuncs.R b/R/EDM_AuxFuncs.R index e7c7002..8343b8f 100644 --- a/R/EDM_AuxFuncs.R +++ b/R/EDM_AuxFuncs.R @@ -14,26 +14,29 @@ ComputeError = function( obs, pred ) { #------------------------------------------------------------------------ # #------------------------------------------------------------------------ -FlattenToString = function( x ) { - # R is wonderful... is.vector( list() ) is TRUE is.list( data.frame ) TRUE +FlattenToString = function( x, delimiter = " " ) { + # R is Bizarre... 
does not have a consistent type system + # is.vector( list() ) is TRUE; is.list( data.frame() ) is TRUE + # length( 'xxx' ) is 1; length( c('xxx' ) ) is 1 + # nchar ( 'xxx' ) is 3; nchar ( c('xxx' ) ) is 3 # Test for data.frame or matrix first, then list, then vector # or, use class string as selector if ( is.data.frame( x ) || is.matrix( x ) ) { s = "" for( row in 1:nrow( x ) ) { - s = paste( s, paste( x[row,], collapse = " " ), collapse = " " ) + s = paste( s, paste( x[row,], collapse = delimiter ), + collapse = delimiter ) } } else if ( is.list( x ) ) { - s = paste( unlist( x ), collapse = " " ) + s = paste( unlist( x ), collapse = delimiter ) } else if ( is.vector( x ) ) { - s = paste( x, collapse = " " ) + s = paste( x, collapse = delimiter ) } else { s = x } - return ( s ) } @@ -41,7 +44,8 @@ FlattenToString = function( x ) { # Validate dataFrame, or load dataFile and create dataFrame to validate #------------------------------------------------------------------------ ValidateDataFrame = function( pathIn, dataFile, dataFrame, - columns, target, noTime ) { + columns, target, noTime, + verbose = FALSE ) { if ( nchar( dataFile ) ) { # Shame to read the data just for this... anti Big Data. R fails anyway. @@ -61,10 +65,35 @@ ValidateDataFrame = function( pathIn, dataFile, dataFrame, print( "Error: ValidateDataFrame(): dataFrame is not valid." ) return( FALSE ) } + if ( verbose ) { + print( "ValidateDataFrame(): dataFrame is valid." ) + } + + columnNames = names( df ) # Names from data.frame itself + + # Names from API input columns and target + # Is there ',' in API string for name with whitespace? 
+ # regex for multiple whitespace : "\\s+" + if ( length( columns ) > 1 ) { + columnVec = columns # Vector of strings passed in columns, use as-is + } + else { + if ( TRUE %in% grepl( ",", columns ) ) { regex_delimiters = ",+" } + else { regex_delimiters = "\\s+" } + columnVec = strsplit( trimws( columns ), regex_delimiters )[[1]] + } - columnNames = names( df ) - columnVec = strsplit( trimws( columns ), "\\s+" )[[1]] # split on whitespace - targetVec = strsplit( trimws( target ), "\\s+" )[[1]] # split on whitespace + if ( length( target ) > 1 ) { + targetVec = target # Vector of strings passed in target, use as-is + } + else { + if ( TRUE %in% grepl( ",", target ) ) { + targetVec = strsplit( trimws( target ), ",+" )[[1]] # CCM can have multiple + } + else { + targetVec = c( target ) # No ',' in target string, take as-is + } + } for ( target in targetVec ) { if ( length( df[,target] ) == 0 ) { @@ -79,7 +108,7 @@ ValidateDataFrame = function( pathIn, dataFile, dataFrame, return( FALSE ) } } - + for ( column in columnVec ) { if ( length( df[,column] ) == 0 ) { print( paste("Error: ValidateDataFrame(): Column", column, "is empty.")) @@ -94,6 +123,9 @@ ValidateDataFrame = function( pathIn, dataFile, dataFrame, } } + if ( verbose ) { + print( "ValidateDataFrame(): dataFrame validated." ) + } return( TRUE ) } @@ -164,18 +196,18 @@ PlotObsPred = function( df, # stats: {'MAE': 0., 'RMSE': 0., 'rho': 0. 
} stats = ComputeError( df $ Observations, df $ Predictions ) - + title = paste( "\nE=", E, " Tp=", Tp, " rho=", round( stats[['rho']], 2 ), " RMSE=", round( stats[['RMSE']], 2 ) ) - + plot( time, df $ Observations, main = title, xlab = names(df)[1], ylab = "", type = "l", col = "blue", lwd = 3, cex.axis = 1.3, cex.lab = 1.3 ) - + lines( time, df $ Predictions, col = "red", lwd = 3 ) - + legend( 'topright', c( "Predictions", "Observations" ), fill = c('red', 'blue' ), bty = 'n', cex = 1.2 ) } @@ -204,7 +236,7 @@ PlotSmap = function( SmapList, print( "PlotSmap: expected at least 3 columns in predictions." ) return( 0 ) } - + # Try to convert first column to Date or POSIXlt or numeric time = NULL if ( is.numeric( p[,1] ) ) { @@ -225,18 +257,18 @@ PlotSmap = function( SmapList, } numCoeff = ncol( c ) - 1 - + old.par = par( no.readonly = TRUE ) - + par( mfrow = c( numCoeff + 1, 1 ), mar = c( 3.5, 4, 0.5, 1 ), mgp = c( 1.5, 0.5, 0 ), cex.axis = 1.3, cex.lab = 1.3 ) - + # Observations & Predictions plot( time, p $ Observations, xlab = names(p)[1], ylab = "", type = "l", col = "blue", lwd = 3, cex.axis = 1.3, cex.lab = 1.3 ) - + lines( time, p $ Predictions, col = "red", lwd = 3 ) legend( 'topright', c( "Predictions", "Observations" ), fill = c('red', 'blue' ), bty = 'n', cex = 1.5 ) @@ -273,7 +305,7 @@ SurrogateData = function( method = c("random_shuffle", "ebisuzaki", "seasonal"), num_surr = 100, T_period = 1, alpha = 0 ) { - + method = match.arg(method) if( method == "random_shuffle" ) { return( sapply( 1:num_surr, function(i) { @@ -284,16 +316,16 @@ SurrogateData = function( if( any( ! 
is.finite(ts) ) ) { stop("SurrogateData(): input time series contained invalid values") } - + n = length(ts) n2 = floor(n/2) - + mu = mean(ts) sigma = sd(ts) a = fft(ts) amplitudes = abs(a) amplitudes[1] = 0 - + return( sapply(1:num_surr, function(i) { if(n %% 2 == 0) # even length { @@ -318,7 +350,7 @@ SurrogateData = function( if( any(!is.finite(ts)) ) { stop("SurrogateData(): input time series contained invalid values") } - + n = length(ts) I_season = suppressWarnings( matrix( 1:T_period, nrow = n, ncol = 1 ) ) diff --git a/man/CCM.Rd b/man/CCM.Rd index f2ba710..98faef6 100644 --- a/man/CCM.Rd +++ b/man/CCM.Rd @@ -30,10 +30,11 @@ time column rows.} \item{exclusionRadius}{excludes vectors from the search space of nearest neighbors if their relative time index is within exclusionRadius.} -\item{columns}{string of whitespace separated column name(s) in the -input data used to create the library.} +\item{columns}{string of whitespace separated column name(s), or vector +of column names used to create the library. If individual column names +contain whitespace place names in a vector, or, append ',' to the name.} -\item{target}{column name in the input data used for prediction.} +\item{target}{column name used for prediction.} \item{libSizes}{string of 3 whitespace separated integer values specifying the intial library size, the final library size, diff --git a/man/Embed.Rd b/man/Embed.Rd index f01ea1f..18627d9 100644 --- a/man/Embed.Rd +++ b/man/Embed.Rd @@ -23,8 +23,9 @@ index or time values. The columns must be named. One of \item{tau}{integer time delay embedding lag specified as number of time column rows.} -\item{columns}{string of whitespace separated column name(s) in the -input data to be embedded.} +\item{columns}{string of whitespace separated column name(s), or vector +of column names to be embedded. 
If individual column names +contain whitespace place names in a vector, or, append ',' to the name.} \item{verbose}{logical to produce additional console reporting.} } diff --git a/man/EmbedDimension.Rd b/man/EmbedDimension.Rd index a8e853c..6cb1f17 100644 --- a/man/EmbedDimension.Rd +++ b/man/EmbedDimension.Rd @@ -39,10 +39,11 @@ time column rows.} \item{exclusionRadius}{excludes vectors from the search space of nearest neighbors if their relative time index is within exclusionRadius.} -\item{columns}{string of whitespace separated column name(s) in the -input data used to create the library.} +\item{columns}{string of whitespace separated column name(s), or vector +of column names used to create the library. If individual column names +contain whitespace place names in a vector, or, append ',' to the name.} -\item{target}{column name in the input data used for prediction.} +\item{target}{column name used for prediction.} \item{embedded}{logical specifying if the input data are embedded.} diff --git a/man/Multiview.Rd b/man/Multiview.Rd index a92ccea..0c18faf 100644 --- a/man/Multiview.Rd +++ b/man/Multiview.Rd @@ -35,10 +35,11 @@ series to forecast.} \item{tau}{lag of time delay embedding specified as number of time column rows.} -\item{columns}{string of whitespace separated column name(s) in the -input data used to create multivariable data sets.} +\item{columns}{string of whitespace separated column name(s), or vector +of column names used to create multivariable data sets. 
If individual column names +contain whitespace place names in a vector, or, append ',' to the name.} -\item{target}{column name in the input data used for prediction.} +\item{target}{column name used for prediction.} \item{multiview}{number of multiview ensembles to average for the final prediction estimate.} diff --git a/man/PredictInterval.Rd b/man/PredictInterval.Rd index fa3b529..e8fdcb3 100644 --- a/man/PredictInterval.Rd +++ b/man/PredictInterval.Rd @@ -39,10 +39,11 @@ time column rows.} \item{exclusionRadius}{excludes vectors from the search space of nearest neighbors if their relative time index is within exclusionRadius.} -\item{columns}{string of whitespace separated column name(s) in the -input data used to create the library.} +\item{columns}{string of whitespace separated column name(s), or vector +of column names used to create the library. If individual column names +contain whitespace place names in a vector, or, append ',' to the name.} -\item{target}{column name in the input data used for prediction.} +\item{target}{column name used for prediction.} \item{embedded}{logical specifying if the input data are embedded.} diff --git a/man/PredictNonlinear.Rd b/man/PredictNonlinear.Rd index 766def4..31a158c 100644 --- a/man/PredictNonlinear.Rd +++ b/man/PredictNonlinear.Rd @@ -47,10 +47,11 @@ time column rows.} \item{exclusionRadius}{excludes vectors from the search space of nearest neighbors if their relative time index is within exclusionRadius.} -\item{columns}{string of whitespace separated column name(s) in the -input data used to create the library.} +\item{columns}{string of whitespace separated column name(s), or vector +of column names used to create the library. 
If individual column names +contain whitespace place names in a vector, or, append ',' to the name.} -\item{target}{column name in the input data used for prediction.} +\item{target}{column name used for prediction.} \item{embedded}{logical specifying if the input data are embedded.} diff --git a/man/SMap.Rd b/man/SMap.Rd index da214dc..685c27d 100644 --- a/man/SMap.Rd +++ b/man/SMap.Rd @@ -42,10 +42,11 @@ time column rows.} \item{exclusionRadius}{excludes vectors from the search space of nearest neighbors if their relative time index is within exclusionRadius.} -\item{columns}{string of whitespace separated column name(s) in the -input data used to create the library.} +\item{columns}{string of whitespace separated column name(s), or vector +of column names used to create the library. If individual column names +contain whitespace place names in a vector, or, append ',' to the name.} -\item{target}{column name in the input data used for prediction.} +\item{target}{column name used for prediction.} \item{embedded}{logical specifying if the input data are embedded.} diff --git a/man/Simplex.Rd b/man/Simplex.Rd index 6c9624f..254b75b 100644 --- a/man/Simplex.Rd +++ b/man/Simplex.Rd @@ -41,10 +41,11 @@ time column rows.} \item{exclusionRadius}{excludes vectors from the search space of nearest neighbors if their relative time index is within exclusionRadius.} -\item{columns}{string of whitespace separated column name(s) in the -input data used to create the library.} +\item{columns}{string of whitespace separated column name(s), or vector +of column names used to create the library. 
If individual column names +contain whitespace place names in a vector, or, append ',' to the name.} -\item{target}{column name in the input data used for prediction.} +\item{target}{column name used for prediction.} \item{embedded}{logical specifying if the input data are embedded.} diff --git a/src/cppEDM/src/API.cc b/src/cppEDM/src/API.cc index 56de140..d46d2bc 100644 --- a/src/cppEDM/src/API.cc +++ b/src/cppEDM/src/API.cc @@ -464,9 +464,6 @@ SMapValues SMap( DataFrame< double > & DF, msg << "WARNING: SMap() " << DF.NanRows().size() << " nan rows detected in columns or target. " << "Original number of rows " << DF.NRows() << ".\n"; - if ( not parameters.embedded ) { - msg << "Time delay embedding presumption violated.\n"; - } std::cout << msg.str(); if ( parameters.verbose ) { diff --git a/src/cppEDM/src/CCM.cc b/src/cppEDM/src/CCM.cc index 2a5dafa..988c8bc 100644 --- a/src/cppEDM/src/CCM.cc +++ b/src/cppEDM/src/CCM.cc @@ -40,7 +40,6 @@ void CCMClass::Project () { // with calls to FindNeighbors(), Simplex() in CrossMap() colToTarget.PrepareEmbedding(); // embedding, target targetToCol.PrepareEmbedding(); // embedding, target - colToTarget.Distances(); // allDistances, allLibRows targetToCol.Distances(); // allDistances, allLibRows @@ -427,10 +426,22 @@ void CCMClass::SetupParameters() { // Swap column : target in targetToCol.parameters // NOTE: CCM allows multiple targets for multivariate or mixed - // embeddings. The targets now become the state-space columns, + // embeddings. The targets become the state-space columns, // and the first columns becomes the univariate target. + + // To support whitespace in column names it is expected there + // is a ',' in the columns_str and target_str if the name has + // whitespace. 
Have to check for ',' in original columns_str + // and manually add ',' to columnNames for Validate() + if ( parameters.columns_str.find( ',' ) != parameters.columns_str.npos ) { + targetToCol.parameters.target_str = + parameters.columnNames.front().append(","); + } + else { + targetToCol.parameters.target_str = parameters.columnNames.front(); + } + targetToCol.parameters.columns_str = parameters.target_str; - targetToCol.parameters.target_str = parameters.columnNames.front(); targetToCol.parameters.Validate(); //------------------------------------------------------------------ @@ -473,17 +484,30 @@ void CCMClass::SetupParameters() { // //---------------------------------------------------------------- void CCMClass::FormatOutput () { - // Create unified column names of output DataFrame - std::stringstream libRhoNames; - libRhoNames << "LibSize " - << parameters.columnNames.front() << ":" - << parameters.targetNames.front() << " " - << parameters.targetNames.front() << ":" - << parameters.columnNames.front(); - - // Allocate unified LibStats output DataFrame in EDM object + // Create unified column names for output DataFrame + std::vector< std::string > libRhoNames; + libRhoNames.push_back( "LibSize" ); + + // allLibStats column names use the first column or target names + std::string columnName = parameters.columnNames.front(); + std::string targetName = parameters.targetNames.front(); + + if ( columnName.back() == ',' ) { + // Remove trailing ',' for allLibStats column name + columnName.erase( columnName.end() - 1 ); + } + + std::stringstream ssColTar; + ssColTar << columnName << ":" << targetName; + libRhoNames.push_back( ssColTar.str() ); + + std::stringstream ssTarCol; + ssTarCol << targetName << ":" << columnName; + libRhoNames.push_back( ssTarCol.str() ); + + // Create unified output DataFrame allLibStats = DataFrame< double >( parameters.librarySizes.size(), 3, - libRhoNames.str() ); + libRhoNames ); allLibStats.WriteColumn( 0, 
colToTargetValues.LibStats.Column( 0 ) ); allLibStats.WriteColumn( 1, colToTargetValues.LibStats.Column( 1 ) ); diff --git a/src/cppEDM/src/Common.cc b/src/cppEDM/src/Common.cc index 3e2222d..a8c1fe0 100644 --- a/src/cppEDM/src/Common.cc +++ b/src/cppEDM/src/Common.cc @@ -36,9 +36,9 @@ std::string ToLower( std::string str ) { // // Return: vector of tokens //---------------------------------------------------------------- -std::vector SplitString( std::string inString, - std::string delimeters ) { - +std::vector SplitString( std::string inString, + std::string delimeters, + bool removeWhitespace ) { size_t pos = 0; size_t eos = 0; size_t wordStart = 0; @@ -78,9 +78,10 @@ std::vector SplitString( std::string inString, word = inString.substr( wordStart, wordEnd - wordStart ); - // remove whitespace - word.erase( std::remove_if( word.begin(), word.end(), ::isspace ), - word.end() ); + if ( removeWhitespace ) { + word.erase( std::remove_if( word.begin(), word.end(), ::isspace ), + word.end() ); + } splitString.push_back( word ); } diff --git a/src/cppEDM/src/Common.h b/src/cppEDM/src/Common.h index 8396c9c..0e9a18e 100644 --- a/src/cppEDM/src/Common.h +++ b/src/cppEDM/src/Common.h @@ -76,7 +76,8 @@ struct MultiviewValues { std::string ToLower( std::string str ); std::vector SplitString( std::string inString, - std::string delimeters ); + std::string delimeters, + bool removeWhitespace ); VectorError ComputeError( std::valarray< double > obs, std::valarray< double > pred ); diff --git a/src/cppEDM/src/DataFrame.h b/src/cppEDM/src/DataFrame.h index 3f00571..9730907 100644 --- a/src/cppEDM/src/DataFrame.h +++ b/src/cppEDM/src/DataFrame.h @@ -13,9 +13,10 @@ #include #include -// Common.cc +// Common.cc : default delimeters = "," for .csv extern std::vector SplitString( std::string inString, - std::string delimeters = "," ); + std::string delimeters = ",", + bool removeWhitespace = true ); // Type definition for CSV NamedData to pair column names & column data typedef 
std::vector>> NamedData; @@ -80,7 +81,7 @@ class DataFrame { //----------------------------------------------------------------- // Empty DataFrame of size (rows, columns) with column names in a - // single whitespace delimited string. + // single string. //----------------------------------------------------------------- DataFrame( size_t rows, size_t columns, std::string colNames ): n_rows( rows ), n_columns( columns ), elements( columns * rows ), @@ -247,7 +248,6 @@ class DataFrame { std::vector< size_t > col_i_vec; // Map column names to indices - std::vector< std::string >::iterator si; for ( auto ci = colNames.begin(); ci != colNames.end(); ++ci ) { auto si = find( columnNames.begin(), columnNames.end(), *ci ); @@ -442,7 +442,16 @@ class DataFrame { void BuildColumnNameIndex( std::string colNames ) { // If colNames provided populate columnNames, columnNameToIndex if ( colNames.size() ) { - columnNames = SplitString( colNames, " ,\t" ); + + // If ',' in colNames, ignore whitespace in delimeter + // to allow space in names + if ( colNames.find( ',' ) != colNames.npos ) { + columnNames = SplitString( colNames, ",", false ); + } + else { + columnNames = SplitString( colNames, " \t,\n" ); + } + if ( columnNames.size() != n_columns ) { std::stringstream errMsg; errMsg << "DataFrame::BuildColumnNameIndex(s) " @@ -693,7 +702,8 @@ class DataFrame { std::vector< std::string > colNames; // First line of .csv is REQUIRED header / column names - std::vector firstLineWords = SplitString( dataLines[0] ); + std::vector firstLineWords = + SplitString( dataLines[0], ",", false ); // Get named columns from header line for (size_t colIdx = 0; colIdx < firstLineWords.size(); colIdx++){ diff --git a/src/cppEDM/src/Eval.cc b/src/cppEDM/src/Eval.cc index 3d77c79..8766c3e 100644 --- a/src/cppEDM/src/Eval.cc +++ b/src/cppEDM/src/Eval.cc @@ -555,7 +555,7 @@ DataFrame< double > PredictNonlinear( DataFrame< double > & data, // Use theta values passed in as parameter string 
ThetaValues.clear(); - std::vector< std::string > theta_vec = SplitString( theta, " \t,\n" ); + std::vector< std::string > theta_vec = SplitString(theta," \t,\n",true); try { for ( auto ci = theta_vec.begin(); ci != theta_vec.end(); ++ci ) { diff --git a/src/cppEDM/src/Parameter.cc b/src/cppEDM/src/Parameter.cc index b33a06b..f0bd77c 100644 --- a/src/cppEDM/src/Parameter.cc +++ b/src/cppEDM/src/Parameter.cc @@ -102,7 +102,7 @@ Parameters::Parameters( validated ( false ), // Instantiate Version - version( 1, 15, 1, "2023-10-27" ) + version( 1, 15, 3, "2023-12-01" ) { // Constructor code if ( method != Method::None ) { @@ -144,8 +144,16 @@ void Parameters::Validate() { //-------------------------------------------------------------- if ( columns_str.size() ) { - std::vector columns_vec = SplitString( columns_str, - " \t,\n" ); + std::vector columns_vec; + + // If ',' in columns_str, do not use whitespace delimiter + // to allow space in names + if ( columns_str.find( ',' ) != columns_str.npos ) { + columns_vec = SplitString( columns_str, ",", false ); + } + else { + columns_vec = SplitString( columns_str, " \t,\n", true ); + } columnNames = columns_vec; } @@ -161,8 +169,16 @@ void Parameters::Validate() { // All other use targetName[0]. //-------------------------------------------------------------- if ( target_str.size() ) { - std::vector columns_vec = SplitString( target_str, - " \t,\n" ); + std::vector columns_vec; + + // If ',' in target_str, do not use whitespace delimiter + // to allow space in names + if ( target_str.find( ',' ) != target_str.npos ) { + columns_vec = SplitString( target_str, ",", false ); + } + else { + columns_vec = SplitString( target_str, " \t,\n", true ); + } targetNames = columns_vec; } @@ -201,7 +217,8 @@ void Parameters::Validate() { // if increment < stop generate the library sequence. // if increment > stop presume list of 3 library sizes. // 2) Otherwise: "x y ..." : list of library sizes. 
- std::vector libsize_vec = SplitString(libSizes_str," \t,"); + std::vector libsize_vec = SplitString( libSizes_str, + " \t,", true ); bool libSizeSequence = false; int start; @@ -351,7 +368,7 @@ void Parameters::Validate() { //-------------------------------------------------------------- if ( lib_str.size() ) { // Parse lib_str into vector of strings - std::vector lib_vec = SplitString( lib_str, " \t," ); + std::vector lib_vec = SplitString( lib_str, " \t,", true ); if ( lib_vec.size() % 2 != 0 ) { std::string errMsg( "Parameters::Validate(): " "library must be even number of integers.\n" ); @@ -455,7 +472,7 @@ void Parameters::Validate() { //-------------------------------------------------------------- if ( pred_str.size() ) { // Parse pred_str into vector of strings - std::vector pred_vec = SplitString( pred_str, " \t," ); + std::vector pred_vec = SplitString(pred_str, " \t,", true); if ( pred_vec.size() % 2 != 0 ) { std::string errMsg( "Parameters::Validate(): " "prediction must be even number of integers.\n");