Skip to content

Commit

Permalink
Merge pull request #1 from RyogaMasaki/rewrite
Browse files Browse the repository at this point in the history
Rewrite
  • Loading branch information
drojaazu authored Dec 2, 2019
2 parents a9cc184 + efc357d commit 7e36ecd
Show file tree
Hide file tree
Showing 21 changed files with 832 additions and 1,243 deletions.
12 changes: 12 additions & 0 deletions .clang-format
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
Language: Cpp
Standard: Cpp11
BasedOnStyle: LLVM
BreakBeforeBraces: Linux
SpaceBeforeParens: Never
TabWidth: 2
UseTab: Always
AlignTrailingComments: true
AllowShortIfStatementsOnASingleLine: false
IndentCaseLabels: true

Cpp11BracedListStyle: true
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
*.code-workspace
bin
build

etc
27 changes: 27 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# define project
# cmake_minimum_required has to be the first command so that policies are set
# before anything else runs. 3.8 is the first release that understands the
# cxx_std_11 compile feature requested for the target below.
cmake_minimum_required (VERSION 3.8)
project (jstrings VERSION 1.1 LANGUAGES CXX)

INCLUDE (CheckIncludeFiles)

set(CMAKE_CXX_STANDARD 11)
# NOTE(review): this is set after project(), by which point the compiler has
# presumably already been detected - confirm whether this line has any effect.
set(CMAKE_CXX_COMPILER_NAMES clang++ g++ icpc c++ cxx)
# set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/bin)
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DDEBUG")

# On the very first configure (no cache yet), fall back to a Release build
# when the user did not pick a build type.
if (NOT EXISTS ${CMAKE_BINARY_DIR}/CMakeCache.txt)
  if (NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE "Release" CACHE STRING "" FORCE)
  endif()
endif()

# define target
aux_source_directory(${PROJECT_SOURCE_DIR}/src CPPFILES)
add_executable(jstrings ${CPPFILES})

target_include_directories(jstrings PUBLIC "${PROJECT_SOURCE_DIR}/inc")
target_compile_features(jstrings PUBLIC cxx_std_11)
# NOTE(review): none of the headers under inc/ reference libpng - confirm that
# the sources in src/ still need this link dependency.
target_link_libraries(jstrings png)

install(TARGETS jstrings
  RUNTIME DESTINATION bin)
695 changes: 21 additions & 674 deletions LICENSE

Large diffs are not rendered by default.

29 changes: 0 additions & 29 deletions Makefile

This file was deleted.

46 changes: 29 additions & 17 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,35 +1,47 @@
# jstrings
A tool for finding JIS-based Japanese characters in binary data.
A tool for finding JIS-based Japanese text in binary data.

## Usage
jstrings [options] [input]
jstrings [options] [input_file]

Input can be a filename or data from stdin.
Input can be a filename or data from stdin. Output is sent to stdout.

### Options
-m number
-e encoding
--encoding encoding

Set minimum number of characters to match as a valid string. Default: 10.
Specify the encoding to use. Use one of the strings listed in parentheses below for that encoding:

-e encoding
* Shift-JIS (shift-jis, shiftjis, sjis)
* EUC-JP (euc, euc-jp, eucjp)
* Microsoft CP932 (cp932, windows932, windows31j)

Specify the encoding to use. Currently, the only valid value is "shift-jis". Default: shift-jis
Optional; default is Shift-JIS.

-l
-m number
--match-length number

Use little-endian order for multibyte characters
Set minimum number of characters to match as a valid string. Optional; default is 10.

-jisx0213
-c number
--cutoff number

Use JIS X 0213 character set instead of JIS X 0208 for double byte characters
Limit the output to the specified number of characters for a string. This is useful for "previewing" a file which may have large blocks of junk data that happen to fall within the range of valid encoding values. Optional; default is no cutoff.

## Notes
## Output
Data is output in its original encoding without any conversion. Other tools, such as iconv, can do conversion to something more useful (such as UTF8). For example:

jstrings file.bin | iconv -f SHIFT-JIS -t UTF-8 -c
# for Shift-JIS
jstrings file.bin | iconv -f SHIFT-JIS -t UTF-8 -c | less
# for CP932
jstrings file.bin | iconv -f CP932 -t UTF-8 -c | less
# for EUC-JP
jstrings file.bin | iconv -f EUC-JP -t UTF-8 -c | less

### To Do
- Add support for other JIS encodings: CP932, EUC
- Add support for JIS X 0212 for non-SJIS encodings (only EUC?)
- Add option to only return strings with double-byte characters present
## Building
CMake is used for the build system. From the root directory:

mkdir build && cd build
cmake ..
make
sudo make install
24 changes: 24 additions & 0 deletions inc/enc_cp932.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
/*!
* \author Damian Rogers ([email protected])
* \version 1.1
* \date 2019.12.01
* \copyright MIT License
*/

#ifndef ENC_CP932_H
#define ENC_CP932_H

#include "enc_shiftjis.h"

namespace encodings
{

/*!
 * \brief Microsoft CP932 implementation of the encoding interface
 *
 * Derives from shift_jis and replaces only the validity check; construction
 * (and therefore the maximum sequence length) is inherited from shift_jis.
 */
class cp932 : public shift_jis
{
public:
	/*!
	 * \brief Checks whether the bytes at \p data form a valid CP932 sequence
	 * \param data Pointer to the first byte to examine
	 * \return The number of bytes in the valid sequence (see
	 * encoding::is_valid for the contract)
	 */
	u8 is_valid(u8 const *data) override;
};

} // namespace encodings

#endif
22 changes: 22 additions & 0 deletions inc/enc_eucjp.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
/*!
* \author Damian Rogers ([email protected])
* \version 1.1
* \date 2019.12.01
* \copyright MIT License
*/
#ifndef ENC_EUCJP_H
#define ENC_EUCJP_H
#include "encoding.h"

namespace encodings
{

/*!
 * \brief EUC-JP implementation of the encoding interface
 */
class euc : public encoding
{
public:
	/*!
	 * \brief Registers 3 as the maximum byte-sequence length with the base
	 * class
	 */
	euc() : encoding(3) {}

	/*!
	 * \brief Checks whether the bytes at \p data form a valid EUC-JP sequence
	 * \param data Pointer to the first byte to examine
	 * \return The number of bytes in the valid sequence (see
	 * encoding::is_valid for the contract)
	 */
	u8 is_valid(u8 const *data) override;
};

} // namespace encodings
#endif // ENC_EUCJP_H
23 changes: 23 additions & 0 deletions inc/enc_shiftjis.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
/*!
* \author Damian Rogers ([email protected])
* \version 1.1
* \date 2019.12.01
* \copyright MIT License
*/

#ifndef ENC_SHIFTJIS_H
#define ENC_SHIFTJIS_H
#include "encoding.h"

namespace encodings
{

/*!
 * \brief Shift-JIS implementation of the encoding interface
 */
class shift_jis : public encoding
{
public:
	/*!
	 * \brief Registers 2 as the maximum byte-sequence length with the base
	 * class
	 */
	shift_jis() : encoding(2) {}

	/*!
	 * \brief Checks whether the bytes at \p data form a valid Shift-JIS
	 * sequence
	 * \param data Pointer to the first byte to examine
	 * \return The number of bytes in the valid sequence (see
	 * encoding::is_valid for the contract)
	 */
	u8 is_valid(u8 const *data) override;
};

} // namespace encodings
#endif // ENC_SHIFTJIS_H
32 changes: 32 additions & 0 deletions inc/encoding.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
/*!
* \author Damian Rogers ([email protected])
* \version 1.1
* \date 2019.12.01
* \copyright MIT License
*/

#ifndef ENCODING_H
#define ENCODING_H
#include "types.h"

/*!
 * \brief Abstract base for encoding classes
 *
 * Concrete encodings derive from this class, pass their maximum byte-sequence
 * length to the constructor and implement is_valid().
 */
class encoding
{
public:
	/*!
	 * \brief Stores the longest byte sequence the encoding can produce
	 * \param max_seq_len Maximum number of bytes in a single sequence
	 */
	explicit encoding(u8 max_seq_len) : max_seq_len(max_seq_len) {}

	/*!
	 * \brief Virtual so that a derived encoding can be destroyed safely
	 * through a pointer or reference to this base class
	 */
	virtual ~encoding() = default;

	/*!
	 * \brief Determines if the given bytes are a valid byte sequence for the
	 * encoding. Returns the number of valid bytes if true.
	 * \param data Pointer to the first byte to examine
	 * \return Number of valid bytes when the sequence is valid; presumably 0
	 * otherwise - confirm against the implementations
	 */
	virtual u8 is_valid(u8 const *data) = 0;

	/*!
	 * \brief Returns the maximum sequence length given at construction
	 */
	u8 get_max_seq_len() const { return this->max_seq_len; }

protected:
	//! Maximum number of bytes in one sequence, set by the constructor
	u8 max_seq_len;
};

#endif // ENCODING_H
31 changes: 31 additions & 0 deletions inc/main.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
/*!
* \author Damian Rogers ([email protected])
* \version 1.1
* \date 2019.12.01
* \copyright MIT License
*/

// Include guard: without it a second inclusion redefines enum enctypes.
#ifndef MAIN_H
#define MAIN_H

#include <getopt.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <fstream>
#include <iomanip>
#include <iostream>

#include "enc_cp932.h"
#include "enc_eucjp.h"
#include "enc_shiftjis.h"
#include "encoding.h"
#include "types.h"

/*!
 * \enum enctypes
 * \brief List of supported encodings
 */
enum enctypes { shift_jis, cp932, eucjp };

/*!
 * \brief Handles the program's command-line arguments
 * \param argc Argument count, as received by main
 * \param argv Argument vector, as received by main
 *
 * NOTE(review): presumably parses the options with getopt (included above) -
 * confirm against the definition in the source file.
 */
void process_args(int argc, char **argv);

/*!
 * \brief Prints usage help
 *
 * NOTE(review): the output destination is not visible from this header -
 * confirm against the definition.
 */
void print_help();

#endif // MAIN_H
48 changes: 48 additions & 0 deletions inc/types.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
/*!
* \author Damian Rogers ([email protected])
* \version 1.1
* \date 2019.12.01
* \copyright MIT License
*/

#ifndef TYPES_H
#define TYPES_H

#include <map>
#include <memory>
#include <stdint.h>
#include <string>      // std::string, used by kvmap
#include <sys/types.h> // off_t, used by found_string
#include <vector>

// Fixed-width unsigned integer shorthands
using u8 = uint8_t;
using u16 = uint16_t;
using u32 = uint32_t;
using u64 = uint64_t;

// Fixed-width signed integer shorthands
using s8 = int8_t;
using s16 = int16_t;
using s32 = int32_t;
using s64 = int64_t;

// Smart pointer shorthands
template <typename T> using uptr = std::unique_ptr<T>;
template <typename T> using sptr = std::shared_ptr<T>;

// String-to-string map
using kvmap = std::map<std::string const, std::string>;

/*!
 * \brief Plain data record describing one string found in the input
 */
struct found_string {
	//! Offset of the start of the found string, relative to the beginning of
	//! the stream
	off_t address;

	//! The extracted string data
	std::vector<uint8_t> data;
};

#endif
Loading

0 comments on commit 7e36ecd

Please sign in to comment.