From 896d732cd21b8343c787674c56f9f2bea6fc8c14 Mon Sep 17 00:00:00 2001 From: Damian R Date: Sat, 1 May 2021 08:57:10 +0900 Subject: [PATCH] Corrected segfault issues when using cin and failures in specifying encodings --- .editorconfig | 9 +++ CMakeLists.txt | 2 +- inc/enc_cp932.h | 4 +- inc/enc_eucjp.h | 6 +- inc/enc_shiftjis.h | 6 +- inc/main.h | 31 --------- inc/types.h | 17 ----- src/enc_cp932.cpp | 4 +- src/enc_eucjp.cpp | 2 +- src/enc_shiftjis.cpp | 2 +- src/main.cpp | 149 ++++++++++++++++++++++++------------------- src/project.hpp | 19 ++++++ 12 files changed, 125 insertions(+), 126 deletions(-) create mode 100644 .editorconfig delete mode 100644 inc/main.h create mode 100644 src/project.hpp diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..92940c2 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,9 @@ +[*] +end_of_line = lf +insert_final_newline = true +charset = utf-8 +trim_trailing_whitespace = true + +[*.{c,h,cpp,hpp}] +indent_style = tab +indent_size = 2 diff --git a/CMakeLists.txt b/CMakeLists.txt index 804b399..55b154d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,7 +3,7 @@ include(GNUInstallDirs) # define project cmake_minimum_required (VERSION 3.5) -project (jstrings VERSION 1.1 LANGUAGES CXX) +project (jstrings VERSION 1.2 LANGUAGES CXX) set(PROJECT_CONTACT "Damian R (damian@sudden-desu.net)") set(PROJECT_WEBSITE "https://github.com/drojaazu") diff --git a/inc/enc_cp932.h b/inc/enc_cp932.h index 21c2de2..1253730 100644 --- a/inc/enc_cp932.h +++ b/inc/enc_cp932.h @@ -13,11 +13,11 @@ namespace encodings { -class cp932 : public shift_jis +class encoding_cp932 : public encoding_shiftjis { public: u8 is_valid(u8 const *data); - ~cp932(){}; + ~encoding_cp932(){}; }; } // namespace encodings diff --git a/inc/enc_eucjp.h b/inc/enc_eucjp.h index 5575647..60618ac 100644 --- a/inc/enc_eucjp.h +++ b/inc/enc_eucjp.h @@ -11,12 +11,12 @@ namespace encodings { -class euc : public encoding +class encoding_eucjp : public encoding { public: - euc() : encoding(3){}; + encoding_eucjp() : encoding(3){}; u8 is_valid(u8 const *data); - ~euc(){}; + ~encoding_eucjp(){}; }; } // namespace encodings diff --git a/inc/enc_shiftjis.h b/inc/enc_shiftjis.h index a4bc5f8..01c8d42 100644 --- a/inc/enc_shiftjis.h +++ b/inc/enc_shiftjis.h @@ -12,12 +12,12 @@ namespace encodings { -class shift_jis : public encoding +class encoding_shiftjis : public encoding { public: - shift_jis() : encoding(2){}; + encoding_shiftjis() : encoding(2){}; u8 is_valid(u8 const *data); - ~shift_jis(){}; + ~encoding_shiftjis(){}; }; } // namespace encodings diff --git a/inc/main.h b/inc/main.h deleted file mode 100644 index aca4044..0000000 --- a/inc/main.h +++ /dev/null @@ -1,31 +0,0 @@ -/*! - * \author Damian Rogers (damian@sudden-desu.net) - * \version 1.1 - * \date 2019.12.01 - * \copyright MIT License - */ - -#include -#include -#include -#include - -#include -#include -#include - -#include "enc_cp932.h" -#include "enc_eucjp.h" -#include "enc_shiftjis.h" -#include "encoding.h" -#include "types.h" - -/*! - * \enum encodings - * \brief List of supported encodings - */ -enum enctypes { shift_jis, cp932, eucjp }; - -void process_args(int argc, char **argv); - -void print_help(); diff --git a/inc/types.h b/inc/types.h index 5577c8a..22f327e 100644 --- a/inc/types.h +++ b/inc/types.h @@ -29,21 +29,4 @@ template using sptr = std::shared_ptr; typedef std::map kvmap; -/*! - * \brief POD structure for containing a found string - */ -class found_string -{ -public: - /*! - * \brief The offset of the beginning of the found string relative to the - * start of the stream - */ - off_t address; - /*! - * \brief The extracted string data - */ - std::vector data; -}; - #endif \ No newline at end of file diff --git a/src/enc_cp932.cpp b/src/enc_cp932.cpp index 2188654..62671ce 100644 --- a/src/enc_cp932.cpp +++ b/src/enc_cp932.cpp @@ -3,9 +3,9 @@ namespace encodings { -u8 cp932::is_valid(u8 const *data) +u8 encoding_cp932::is_valid(u8 const *data) { - u8 valid_count = shift_jis::is_valid(data); + u8 valid_count = encoding_shiftjis::is_valid(data); if(valid_count > 0) return valid_count; else { diff --git a/src/enc_eucjp.cpp b/src/enc_eucjp.cpp index 091bc20..64d397f 100644 --- a/src/enc_eucjp.cpp +++ b/src/enc_eucjp.cpp @@ -5,7 +5,7 @@ namespace encodings { -u8 euc::is_valid(u8 const *data) +u8 encoding_eucjp::is_valid(u8 const *data) { u8 c_hi{*data}; diff --git a/src/enc_shiftjis.cpp b/src/enc_shiftjis.cpp index a745c85..56ea3f9 100644 --- a/src/enc_shiftjis.cpp +++ b/src/enc_shiftjis.cpp @@ -10,7 +10,7 @@ namespace encodings 0208 There is extended support for 0213, though we're not going to fiddle with it Maybe we'll make an extended class */ -u8 shift_jis::is_valid(u8 const *data) +u8 encoding_shiftjis::is_valid(u8 const *data) { // JIS X 0201 - 8-bit characters (including 7-bit ASCII) // excludes non-printable (control code) and reserved bytes diff --git a/src/main.cpp b/src/main.cpp index 854b528..df5f95f 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1,60 +1,93 @@ #include "main.h" #include "project.hpp" +#include "enc_shiftjis.h" +#include "enc_eucjp.h" +#include "enc_cp932.h" #ifdef DEBUG #include #endif using namespace std; +using namespace encodings; // 512k of buffer -static u32 const DATABUFF_SIZE = 524288; -static u8 const DEFAULT_MATCH_LEN = 10; +static u32 constexpr DATABUFF_SIZE {(1024 * 512)}; +static u8 constexpr DEFAULT_MATCH_LEN {10}; +enum enctypes { shift_jis, cp932, eucjp }; -istream *indata = nullptr; -size_t match_len = DEFAULT_MATCH_LEN; -size_t str_cutoff{0}; -string encoding_str = ""; +struct runtime_config_jstrings { + string infile; + string encoding; + uint match_length = DEFAULT_MATCH_LEN; + uint cutoff {0}; +}; + +void process_args(int argc, char **argv, runtime_config_jstrings &cfg); static const map enclist{ - {"shift-jis", shift_jis}, {"shiftjis", shift_jis}, {"sjis", shift_jis}, - {"cp932", cp932}, {"windows932", cp932}, {"windows31j", cp932}, - {"euc", eucjp}, {"euc-jp", eucjp}, {"eucjp", eucjp}}; + {"shift-jis", shift_jis}, + {"shiftjis", shift_jis}, + {"sjis", shift_jis}, + {"cp932", cp932}, + {"windows932", cp932}, + {"windows31j", cp932}, + {"euc", eucjp}, + {"euc-jp", eucjp}, + {"eucjp", eucjp}}; + +typedef pair> found_string; int main(int argc, char **argv) { - encoding *encoding = nullptr; + runtime_config_jstrings cfg; + + encoding *encoding {nullptr}; + encoding_shiftjis enc_sjis; + encoding_eucjp enc_eucjp; + encoding_cp932 enc_cp932; + vector results; + istream *indata {nullptr}; + ifstream infile; + try { // SETUP - process_args(argc, argv); + process_args(argc, argv, cfg); + + // sanity checks + if(cfg.match_length < 1) + throw invalid_argument("Match length must be a positive value"); - if(indata == nullptr) + if(cfg.infile.empty()) indata = &cin; else { - if(!indata->good()) { + infile = ifstream(cfg.infile); + if(!infile.good()) { throw invalid_argument("File could not be opened"); } + + indata = &infile; indata->seekg(0); } - if(encoding_str.empty()) - encoding = new encodings::shift_jis(); + if(cfg.encoding.empty()) + encoding = &enc_sjis; else { - if(enclist.find(encoding_str) == enclist.end()) { - throw invalid_argument("Invlaid encoding specified"); + if(enclist.find(cfg.encoding) == enclist.end()) { + throw invalid_argument("Invalid encoding specified"); } - switch(enclist.at(encoding_str)) { - case shift_jis: - encoding = new encodings::shift_jis(); + switch(enclist.at(cfg.encoding)) { + case enctypes::shift_jis: + encoding = &enc_sjis; break; - case eucjp: - encoding = new encodings::euc(); + case enctypes::eucjp: + encoding = &enc_eucjp; break; - case cp932: - encoding = new encodings::cp932(); + case enctypes::cp932: + encoding = &enc_cp932; break; default: cerr << "Encoding not yet supported" << endl; @@ -88,7 +121,7 @@ int main(int argc, char **argv) u8 validcount; // work string; where we dump valid bytes found_string workstr; - workstr.data.reserve(match_len); + workstr.second.reserve(cfg.match_length); // the databuff_ptr value when we should read another chunk u32 buffborder; // cache this... @@ -140,33 +173,33 @@ int main(int argc, char **argv) if(glyphcount == 0) { // this is the first character, so store the address where the // beginning of the string was found - workstr.address = stream_ptr; + workstr.first = stream_ptr; } glyphcount++; - if(str_cutoff > 0 && glyphcount >= str_cutoff) { + if(cfg.cutoff > 0 && glyphcount > cfg.cutoff) { databuff_ptr += validcount; stream_ptr += validcount; continue; } std::copy(&databuff[databuff_ptr], &databuff[databuff_ptr + validcount], - std::back_inserter(workstr.data)); + std::back_inserter(workstr.second)); databuff_ptr += validcount; stream_ptr += validcount; } else { // data is invalid // if there are enough characters in the work string, add it to the // list - if(glyphcount >= match_len) { - workstr.data.push_back('\0'); + if(glyphcount >= cfg.match_length) { + workstr.second.push_back('\0'); results.push_back(workstr); } ++databuff_ptr; ++stream_ptr; if(glyphcount > 0) { glyphcount = 0; - workstr.data.clear(); - workstr.data.reserve(match_len); + workstr.second.clear(); + workstr.second.reserve(cfg.match_length); } } } @@ -186,34 +219,23 @@ int main(int argc, char **argv) // RESULTS found_string thisstring; - std::cout << showbase << internal << setfill('0') << hex; + cout << showbase << internal << setfill('0') << hex; - for(size_t siter = 0; siter < results.size(); siter++) { - thisstring = results.at(siter); - std::cout << thisstring.address << " " << &thisstring.data[0] << endl; + for(found_string this_result : results) { + cout << this_result.first << " " << &this_result.second[0] << endl; } - if(indata != &cin) - delete indata; - delete encoding; - // delete results; - return 0; - } catch(const exception &e) { + } + catch(const exception &e) { cerr << "Fatal error: " << e.what() << endl; - - if(indata != &cin) - delete indata; - delete encoding; - // delete results; - return -1; } } -void process_args(int argc, char **argv) +void process_args(int argc, char **argv, runtime_config_jstrings &cfg) { - const char *const short_opts = ":hm:e:lxf"; + const char *const short_opts = ":hm:c:e:"; const option long_opts[] = {{"help", no_argument, nullptr, 'h'}, {"match-length", required_argument, nullptr, 'm'}, {"cutoff", required_argument, nullptr, 'c'}, @@ -222,24 +244,20 @@ void process_args(int argc, char **argv) while(true) { const auto this_opt = - getopt_long(argc, argv, short_opts, long_opts, nullptr); + getopt_long(argc, argv, short_opts, long_opts, nullptr); if(this_opt == -1) break; switch(this_opt) { case 'm': - match_len = strtoul(optarg, nullptr, 10); - if(match_len < 1) - throw invalid_argument("Match length must be a positive value"); + cfg.match_length = strtoul(optarg, nullptr, 10); break; case 'c': - str_cutoff = strtoul(optarg, nullptr, 10); - if(str_cutoff < 1) - throw invalid_argument("Max length must be a positive value"); + cfg.cutoff = strtoul(optarg, nullptr, 10); break; case 'e': - encoding_str = argv[optind]; + cfg.encoding = optarg; break; case 'h': print_help(); @@ -264,20 +282,21 @@ void process_args(int argc, char **argv) if(optind < argc) { // only read the first non-option argument, assuming it is input filename - indata = new ifstream(argv[optind]); + cfg.infile = argv[optind]; } } void print_help() { - cerr << PROJECT::PROJECT_NAME << " - ver. " << PROJECT::VERSION << endl << endl; - cerr << "Valid options:" << endl; - cerr << " --encoding, -e Specify encoding to use" << endl; - cerr << " (Valid options: shiftjis, cp932, eucjp)" << endl; - cerr << " --match-length, -m Specify number of sequential characters " + cout << PROJECT::PROJECT_NAME << " - ver. " << PROJECT::VERSION << endl; + cout << PROJECT::PROJECT_CONTACT << " - " << PROJECT::PROJECT_WEBSITE << endl << endl; + cout << "Valid options:" << endl; + cout << " --encoding, -e Specify encoding to use" << endl; + cout << " (Valid options: shiftjis, cp932, eucjp)" << endl; + cout << " --match-length, -m Specify number of sequential characters " "required to qualify as a string" << endl; - cerr << " --cutoff, -c Specify maximum number of characters to " + cout << " --cutoff, -c Specify maximum number of characters to " "display in a single string" << endl; } diff --git a/src/project.hpp b/src/project.hpp new file mode 100644 index 0000000..efec8b8 --- /dev/null +++ b/src/project.hpp @@ -0,0 +1,19 @@ +#ifndef __MAIN_HPP +#define __MAIN_HPP + +#include + +/* + These values should be set within CMakeLists.txt +*/ +namespace PROJECT { + static unsigned int const VERSION_MAJOR{1}; + static unsigned int const VERSION_MINOR{2}; + static unsigned int const VERSION_PATCH{}; + static std::string const VERSION{"1.2"}; + + static std::string const PROJECT_NAME{"jstrings"}; + static std::string const PROJECT_CONTACT{"Damian R (damian@sudden-desu.net)"}; + static std::string const PROJECT_WEBSITE{"https://github.com/drojaazu"}; +} +#endif