Skip to content

Commit

Permalink
Move character lists data before the byte code in a pattern
Browse files Browse the repository at this point in the history
This ensures the data is stored aligned even when the range is repeated.
Furthermore, character lists are stored once regardless of repeats.
  • Loading branch information
Zoltan Herczeg committed Oct 29, 2024
1 parent 96f0653 commit bf0e3e9
Show file tree
Hide file tree
Showing 13 changed files with 151 additions and 98 deletions.
3 changes: 2 additions & 1 deletion src/pcre2_auto_possess.c
Original file line number Diff line number Diff line change
Expand Up @@ -1111,7 +1111,8 @@ for(;;)
#ifdef SUPPORT_WIDE_CHARS
case OP_XCLASS:
if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -
list_ptr[2] + LINK_SIZE, utf)) return FALSE;
list_ptr[2] + LINK_SIZE, (const uint8_t*)cb->start_code, utf))
return FALSE;
break;
#endif

Expand Down
132 changes: 71 additions & 61 deletions src/pcre2_compile.c
Original file line number Diff line number Diff line change
Expand Up @@ -6541,94 +6541,76 @@ for (;; pptr++)

if ((xclass_props & XCLASS_HAS_CHAR_LISTS) != 0)
{
/* Char lists size is an even number,
because all items are 16 or 32 bit values. */
/* Char lists size is an even number, because all items are 16 or 32
bit values. The character list data is always aligned to 32 bits. */
size_t char_lists_size = cranges->char_lists_size;
PCRE2_ASSERT((char_lists_size & 0x1) == 0);
PCRE2_ASSERT((char_lists_size & 0x1) == 0 &&
(cb->char_lists_size & 0x3) == 0);

if (lengthptr != NULL)
{
/* At this point, we don't know the precise location
so the maximum alignment is added to the length. */
char_lists_size = CLIST_ALIGN_TO(char_lists_size, sizeof(uint32_t));

#if PCRE2_CODE_UNIT_WIDTH == 8
*lengthptr += 2 /* sizeof(type) in PCRE2_UCHARs */ +
3 /* maximum alignment. */;
#elif PCRE2_CODE_UNIT_WIDTH == 16
*lengthptr += 1 /* sizeof(type) in PCRE2_UCHARs */ +
1 /* maximum alignment. */;
char_lists_size >>= 1;
*lengthptr += 2 + LINK_SIZE;
#else
*lengthptr += 1 /* sizeof(type) in PCRE2_UCHARs */;
/* Padding, when the size is not divisible by 4. */
if ((char_lists_size & 0x2) != 0)
char_lists_size += 2;
char_lists_size >>= 2;
*lengthptr += 1 + LINK_SIZE;
#endif

if (OFLOW_MAX - *lengthptr < char_lists_size)
{
*errorcodeptr = ERR20; /* Integer overflow */
return 0;
}
cb->char_lists_size += char_lists_size;

*lengthptr += char_lists_size;
char_lists_size /= sizeof(PCRE2_UCHAR);

if (*lengthptr > MAX_PATTERN_SIZE)
/* Storage space for character lists is included
in the maximum pattern size. */
if (*lengthptr > MAX_PATTERN_SIZE ||
MAX_PATTERN_SIZE - *lengthptr < char_lists_size)
{
*errorcodeptr = ERR20; /* Pattern is too large */
return 0;
}
}
else
{
uint8_t *char_buffer = (uint8_t*)code;
uint8_t *data;

PCRE2_ASSERT(cranges->char_lists_types <= XCL_TYPE_MASK);
#if PCRE2_CODE_UNIT_WIDTH == 8
/* Encode as high / low bytes. */
code[0] = (uint8_t)(XCL_LIST |
(cranges->char_lists_types >> 8));
code[1] = (uint8_t)cranges->char_lists_types;
char_buffer += 2;
code += 2;
#else
*code++ = (PCRE2_UCHAR)(XCL_LIST | cranges->char_lists_types);
#endif

/* Compute alignment. */
if (((uintptr_t)char_buffer & 0x1) != 0)
{
code[0] |= 1u << (XCL_ALIGNMENT_SHIFT - 8);
char_buffer += 1;
}
/* Character lists are stored in backwards direction from
the byte code start which is available in the match block. */

if (((uintptr_t)char_buffer & 0x2) != (char_lists_size & 0x2))
{
code[0] |= 2u << (XCL_ALIGNMENT_SHIFT - 8);
char_buffer += 2;
}
#elif PCRE2_CODE_UNIT_WIDTH == 16
code[0] = (PCRE2_UCHAR)(XCL_LIST | cranges->char_lists_types);
char_buffer += 2;
cb->char_lists_size += char_lists_size;
data = (uint8_t*)cb->start_code - cb->char_lists_size;

/* Compute alignment. */
if (((uintptr_t)char_buffer & 0x2) != (char_lists_size & 0x2))
{
code[0] |= 2u << XCL_ALIGNMENT_SHIFT;
char_buffer += 2;
}
#else
code[0] = (PCRE2_UCHAR)(XCL_LIST | cranges->char_lists_types);
char_buffer += 4;
memcpy(data, (uint8_t*)(cranges + 1) + cranges->char_lists_start,
char_lists_size);


/* Since character lists total size is less than MAX_PATTERN_SIZE,
their starting offset fits into a value which size is LINK_SIZE. */

char_lists_size = cb->char_lists_size;
PUT(code, 0, (uint32_t)(char_lists_size >> 1));
code += LINK_SIZE;

/* Padding. */
if ((char_lists_size & 0x2) != 0)
{
code[0] |= 2u << XCL_ALIGNMENT_SHIFT;
char_buffer += 2;
}
cb->char_lists_size += 2;
/* Make tools happy by setting memory data. */
((uint16_t*)data)[-1] = 0xffff;
#ifdef SUPPORT_VALGRIND
VALGRIND_MAKE_MEM_NOACCESS(data - 2, 2);
#endif
memcpy(char_buffer,
(uint8_t*)(cranges + 1) + cranges->char_lists_start,
char_lists_size);

code = (PCRE2_UCHAR*)(char_buffer + char_lists_size);
}

cb->cx->memctl.free(cranges, cb->cx->memctl.memory_data);
}
Expand Down Expand Up @@ -10558,6 +10540,7 @@ cb.workspace_size = COMPILE_WORK_SIZE;
#ifdef SUPPORT_WIDE_CHARS
cb.cranges = NULL;
cb.next_cranges = NULL;
cb.char_lists_size = 0;
#endif

/* Maximum back reference and backref bitmap. The bitmap records up to 31 back
Expand Down Expand Up @@ -10954,13 +10937,36 @@ if (length > MAX_PATTERN_SIZE)
goto HAD_CB_ERROR;
}

#if defined SUPPORT_WIDE_CHARS
PCRE2_ASSERT((cb.char_lists_size & 0x3) == 0);
if (MAX_PATTERN_SIZE - length < (cb.char_lists_size / sizeof(PCRE2_UCHAR)))
{
errorcode = ERR20;
goto HAD_CB_ERROR;
}
#endif

/* Compute the size of, then, if not too large, get and initialize the data
block for storing the compiled pattern and names table. Integer overflow should
no longer be possible because nowadays we limit the maximum value of
cb.names_found and cb.name_entry_size. */

re_blocksize = CU2BYTES(length +
(PCRE2_SIZE)cb.names_found * (PCRE2_SIZE)cb.name_entry_size);
re_blocksize =
CU2BYTES((PCRE2_SIZE)cb.names_found * (PCRE2_SIZE)cb.name_entry_size);

#if defined SUPPORT_WIDE_CHARS
if (cb.char_lists_size != 0)
{
#if PCRE2_CODE_UNIT_WIDTH != 32
/* Align to 32 bit first. This ensures the
allocated area will also be 32 bit aligned. */
re_blocksize = (PCRE2_SIZE)CLIST_ALIGN_TO(re_blocksize, sizeof(uint32_t));
#endif
re_blocksize += cb.char_lists_size;
}
#endif

re_blocksize += CU2BYTES(length);

if (re_blocksize > ccontext->max_pattern_compiled_length)
{
Expand Down Expand Up @@ -10989,6 +10995,7 @@ re->tables = tables;
re->executable_jit = NULL;
memset(re->start_bitmap, 0, 32 * sizeof(uint8_t));
re->blocksize = re_blocksize;
re->code_start = re_blocksize - CU2BYTES(length);
re->magic_number = MAGIC_NUMBER;
re->compile_options = options;
re->overall_options = cb.external_options;
Expand All @@ -11012,8 +11019,7 @@ re->optimization_flags = optim_flags;
/* The basic block is immediately followed by the name table, and the compiled
code follows after that. */

codestart = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
re->name_entry_size * re->name_count;
codestart = (PCRE2_UCHAR *)((uint8_t *)re + re->code_start);

/* Update the compile data block for the actual compile. The starting points of
the name/number translation table and of the code are passed around in the
Expand All @@ -11028,6 +11034,10 @@ cb.start_code = codestart;
cb.req_varyopt = 0;
cb.had_accept = FALSE;
cb.had_pruneorskip = FALSE;
#ifdef SUPPORT_WIDE_CHARS
cb.char_lists_size = 0;
#endif


/* If any named groups were found, create the name/number table from the list
created in the pre-pass. */
Expand Down
3 changes: 3 additions & 0 deletions src/pcre2_compile.h
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,9 @@ therefore no need for it to have a length entry, so use a high value. */
#define SELECT_VALUE8(value8, value) (value)
#endif

/* Macro for aligning data: rounds base up to the next multiple of align.
The mask arithmetic is only correct when align is a power of two. Both
arguments are fully parenthesized in the expansion, so expressions that
contain low-precedence operators (for example a | b) can be passed safely. */
#define CLIST_ALIGN_TO(base, align) \
  (((base) + ((size_t)(align) - 1)) & ~((size_t)(align) - 1))

/* Merge intersecting ranges of classes. */

Expand Down
7 changes: 4 additions & 3 deletions src/pcre2_dfa_match.c
Original file line number Diff line number Diff line change
Expand Up @@ -2690,7 +2690,9 @@ for (;;)
else
{
ecode = code + GET(code, 1);
if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
if (clen > 0) isinclass =
PRIV(xclass)(c, code + 1 + LINK_SIZE,
(const uint8_t*)mb->start_code, utf);
}

/* At this point, isinclass is set for all kinds of class, and ecode
Expand Down Expand Up @@ -3522,8 +3524,7 @@ if (mb->match_limit_depth > re->limit_depth)
if (mb->heap_limit > re->limit_heap)
mb->heap_limit = re->limit_heap;

mb->start_code = (PCRE2_SPTR)((const uint8_t *)re + sizeof(pcre2_real_code)) +
re->name_count * re->name_entry_size;
mb->start_code = (PCRE2_SPTR)((const uint8_t *)re + re->code_start);
mb->tables = re->tables;
mb->start_subject = subject;
mb->end_subject = end_subject;
Expand Down
12 changes: 3 additions & 9 deletions src/pcre2_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -1351,9 +1351,8 @@ contain characters with values greater than 255. */
#define XCL_NOTPROP 4 /* Unicode inverted property (ditto) */
/* This value represents the beginning of character lists. The value
is 16 bit long, and stored as a high and low byte pair in 8 bit mode.
The lower 12 bit contains information about character lists (see later)
and next two bits contains the alignment (padding) data. */
#define XCL_LIST (sizeof(PCRE2_UCHAR) == 1 ? 0x40 : 0x4000)
The lower 12 bit contains information about character lists (see later). */
#define XCL_LIST (sizeof(PCRE2_UCHAR) == 1 ? 0x10 : 0x1000)

/* When a character class contains many characters/ranges,
they are stored in character lists. There are four character
Expand Down Expand Up @@ -1416,11 +1415,6 @@ represents that the item count is stored at the beginning of the
character list. The item count has the same width as the items
in the character list (e.g. 16 bit for Low16 and High16 lists). */
#define XCL_ITEM_COUNT_MASK 0x3
/* Shift and mask for getting alignment data. The items of a character
list are always naturally aligned. Adding this value to the byte position
of the XCL_LIST header ensures the required alignment of the items. */
#define XCL_ALIGNMENT_SHIFT 12
#define XCL_ALIGNMENT_MASK 0x3
/* Shift and flag for constructing character list items. The XCL_CHAR_END
is set, when the item is not the beginning of a range. The XCL_CHAR_SHIFT
can be used to encode / decode the character value stored in an item. */
Expand Down Expand Up @@ -2182,7 +2176,7 @@ extern int _pcre2_study(pcre2_real_code *);
extern int _pcre2_valid_utf(PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE *);
extern BOOL _pcre2_was_newline(PCRE2_SPTR, uint32_t, PCRE2_SPTR,
uint32_t *, BOOL);
extern BOOL _pcre2_xclass(uint32_t, PCRE2_SPTR, BOOL);
extern BOOL _pcre2_xclass(uint32_t, PCRE2_SPTR, const uint8_t *, BOOL);

/* This function is needed only when memmove() is not available. */

Expand Down
6 changes: 4 additions & 2 deletions src/pcre2_intmodedep.h
Original file line number Diff line number Diff line change
Expand Up @@ -631,6 +631,7 @@ typedef struct pcre2_real_code {
void *executable_jit; /* Pointer to JIT code */
uint8_t start_bitmap[32]; /* Bitmap for starting code unit < 256 */
CODE_BLOCKSIZE_TYPE blocksize; /* Total (bytes) that was malloc-ed */
CODE_BLOCKSIZE_TYPE code_start; /* Byte code start offset */
uint32_t magic_number; /* Paranoid and endianness check */
uint32_t compile_options; /* Options passed to pcre2_compile() */
uint32_t overall_options; /* Options after processing the pattern */
Expand Down Expand Up @@ -786,8 +787,9 @@ typedef struct compile_block {
BOOL had_recurse; /* Had a pattern recursion or subroutine call */
BOOL dupnames; /* Duplicate names exist */
#ifdef SUPPORT_WIDE_CHARS
class_ranges* cranges; /* First class range. */
class_ranges* next_cranges; /* Next class range. */
class_ranges *cranges; /* First class range. */
class_ranges *next_cranges; /* Next class range. */
size_t char_lists_size; /* Current size of character lists */
#endif
} compile_block;

Expand Down
6 changes: 3 additions & 3 deletions src/pcre2_jit_compile.c
Original file line number Diff line number Diff line change
Expand Up @@ -7767,8 +7767,7 @@ cc++;
#endif /* CODE_UNIT_WIDTH */

/* Align characters. */
next_char = (const uint8_t*)cc;
next_char += (type >> XCL_ALIGNMENT_SHIFT) & XCL_ALIGNMENT_MASK;
next_char = (const uint8_t*)common->start - (GET(cc, 0) << 1);
type &= XCL_TYPE_MASK;

/* Estimate size. */
Expand Down Expand Up @@ -7907,6 +7906,7 @@ while (type > 0)
}

SLJIT_ASSERT(range_count > 0 && range_count <= (est_range_count << 1));
SLJIT_ASSERT(next_char <= (const uint8_t*)common->start);
ranges->range_count = range_count;
}

Expand Down Expand Up @@ -14691,7 +14691,7 @@ memset(&rootbacktrack, 0, sizeof(backtrack_common));
memset(common, 0, sizeof(compiler_common));
common->re = re;
common->name_table = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code));
rootbacktrack.cc = common->name_table + re->name_count * re->name_entry_size;
rootbacktrack.cc = (PCRE2_SPTR)((uint8_t *)re + re->code_start);

#ifdef SUPPORT_UNICODE
common->invalid_utf = (mode & PCRE2_JIT_INVALID_UTF) != 0;
Expand Down
13 changes: 9 additions & 4 deletions src/pcre2_match.c
Original file line number Diff line number Diff line change
Expand Up @@ -2239,7 +2239,9 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,
RRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(fc, Feptr);
if (!PRIV(xclass)(fc, Lxclass_data, utf)) RRETURN(MATCH_NOMATCH);
if (!PRIV(xclass)(fc, Lxclass_data,
(const uint8_t*)mb->start_code, utf))
RRETURN(MATCH_NOMATCH);
}

/* If Lmax == Lmin we can just continue with the main loop. */
Expand All @@ -2262,7 +2264,9 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,
RRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(fc, Feptr);
if (!PRIV(xclass)(fc, Lxclass_data, utf)) RRETURN(MATCH_NOMATCH);
if (!PRIV(xclass)(fc, Lxclass_data,
(const uint8_t*)mb->start_code, utf))
RRETURN(MATCH_NOMATCH);
}
PCRE2_UNREACHABLE(); /* Control never reaches here */
}
Expand All @@ -2285,7 +2289,8 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,
#else
fc = *Feptr;
#endif
if (!PRIV(xclass)(fc, Lxclass_data, utf)) break;
if (!PRIV(xclass)(fc, Lxclass_data,
(const uint8_t*)mb->start_code, utf)) break;
Feptr += len;
}

Expand Down Expand Up @@ -7165,7 +7170,7 @@ given name, for condition testing. The code follows the name table. */
mb->name_table = (PCRE2_SPTR)((const uint8_t *)re + sizeof(pcre2_real_code));
mb->name_count = re->name_count;
mb->name_entry_size = re->name_entry_size;
mb->start_code = mb->name_table + re->name_count * re->name_entry_size;
mb->start_code = (PCRE2_SPTR)((const uint8_t *)re + re->code_start);

/* Process the \R and newline settings. */

Expand Down
Loading

0 comments on commit bf0e3e9

Please sign in to comment.