1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
|
From aee6abb2400b9a955c2b41166db1c22f63ad42ef Mon Sep 17 00:00:00 2001
From: Rich Felker <dalias@aerifal.cx>
Date: Thu, 6 Oct 2016 12:15:47 -0400
Subject: fix regexec with haystack strings longer than INT_MAX
we inherited from TRE regexec code that's utterly wrong with respect
to the integer types it's using. while it doesn't appear that
compilers are producing unsafe output, signed integer overflows seem
to happen, and regexec fails to find matches past offset INT_MAX.
this patch fixes the type of all variables/fields used to store
offsets in the string from int to regoff_t. after the changes, basic
testing showed that regexec can now find matches past 2GB (INT_MAX)
and past 4GB on x86_64, and code generation is unchanged on i386.
---
src/regex/regexec.c | 54 +++++++++++++++++++++++++++--------------------------
1 file changed, 28 insertions(+), 26 deletions(-)
diff --git a/src/regex/regexec.c b/src/regex/regexec.c
index dd52319..5c4cb92 100644
--- a/src/regex/regexec.c
+++ b/src/regex/regexec.c
@@ -44,7 +44,7 @@
static void
tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
- const tre_tnfa_t *tnfa, int *tags, int match_eo);
+ const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo);
/***********************************************************************
from tre-match-utils.h
@@ -97,7 +97,7 @@ tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
/* Returns 1 if `t1' wins `t2', 0 otherwise. */
static int
tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
- int *t1, int *t2)
+ regoff_t *t1, regoff_t *t2)
{
int i;
for (i = 0; i < num_tags; i++)
@@ -157,25 +157,25 @@ tre_neg_char_classes_match(tre_ctype_t *classes, tre_cint_t wc, int icase)
typedef struct {
tre_tnfa_transition_t *state;
- int *tags;
+ regoff_t *tags;
} tre_tnfa_reach_t;
typedef struct {
- int pos;
- int **tags;
+ regoff_t pos;
+ regoff_t **tags;
} tre_reach_pos_t;
static reg_errcode_t
tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
- int *match_tags, int eflags,
- int *match_end_ofs)
+ regoff_t *match_tags, int eflags,
+ regoff_t *match_end_ofs)
{
/* State variables required by GET_NEXT_WCHAR. */
tre_char_t prev_c = 0, next_c = 0;
const char *str_byte = string;
- int pos = -1;
- int pos_add_next = 1;
+ regoff_t pos = -1;
+ regoff_t pos_add_next = 1;
#ifdef TRE_MBSTATE
mbstate_t mbstate;
#endif /* TRE_MBSTATE */
@@ -191,10 +191,10 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
int *tag_i;
int num_tags, i;
- int match_eo = -1; /* end offset of match (-1 if no match found yet) */
+ regoff_t match_eo = -1; /* end offset of match (-1 if no match found yet) */
int new_match = 0;
- int *tmp_tags = NULL;
- int *tmp_iptr;
+ regoff_t *tmp_tags = NULL;
+ regoff_t *tmp_iptr;
#ifdef TRE_MBSTATE
memset(&mbstate, '\0', sizeof(mbstate));
@@ -214,7 +214,7 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
/* Ensure that tbytes and xbytes*num_states cannot overflow, and that
* they don't contribute more than 1/8 of SIZE_MAX to total_bytes. */
- if (num_tags > SIZE_MAX/(8 * sizeof(int) * tnfa->num_states))
+ if (num_tags > SIZE_MAX/(8 * sizeof(regoff_t) * tnfa->num_states))
goto error_exit;
/* Likewise check rbytes. */
@@ -229,7 +229,7 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
tbytes = sizeof(*tmp_tags) * num_tags;
rbytes = sizeof(*reach_next) * (tnfa->num_states + 1);
pbytes = sizeof(*reach_pos) * tnfa->num_states;
- xbytes = sizeof(int) * num_tags;
+ xbytes = sizeof(regoff_t) * num_tags;
total_bytes =
(sizeof(long) - 1) * 4 /* for alignment paddings */
+ (rbytes + xbytes * tnfa->num_states) * 2 + tbytes + pbytes;
@@ -490,12 +490,12 @@ error_exit:
*/
typedef struct {
- int pos;
+ regoff_t pos;
const char *str_byte;
tre_tnfa_transition_t *state;
int state_id;
int next_c;
- int *tags;
+ regoff_t *tags;
#ifdef TRE_MBSTATE
mbstate_t mbstate;
#endif /* TRE_MBSTATE */
@@ -591,13 +591,13 @@ typedef struct tre_backtrack_struct {
static reg_errcode_t
tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
- int *match_tags, int eflags, int *match_end_ofs)
+ regoff_t *match_tags, int eflags, regoff_t *match_end_ofs)
{
/* State variables required by GET_NEXT_WCHAR. */
tre_char_t prev_c = 0, next_c = 0;
const char *str_byte = string;
- int pos = 0;
- int pos_add_next = 1;
+ regoff_t pos = 0;
+ regoff_t pos_add_next = 1;
#ifdef TRE_MBSTATE
mbstate_t mbstate;
#endif /* TRE_MBSTATE */
@@ -610,15 +610,16 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
started from. */
int next_c_start;
const char *str_byte_start;
- int pos_start = -1;
+ regoff_t pos_start = -1;
#ifdef TRE_MBSTATE
mbstate_t mbstate_start;
#endif /* TRE_MBSTATE */
/* End offset of best match so far, or -1 if no match found yet. */
- int match_eo = -1;
+ regoff_t match_eo = -1;
/* Tag arrays. */
- int *next_tags, *tags = NULL;
+ int *next_tags;
+ regoff_t *tags = NULL;
/* Current TNFA state. */
tre_tnfa_transition_t *state;
int *states_seen = NULL;
@@ -768,8 +769,9 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
/* This is a back reference state. All transitions leaving from
this state have the same back reference "assertion". Instead
of reading the next character, we match the back reference. */
- int so, eo, bt = trans_i->u.backref;
- int bt_len;
+ regoff_t so, eo;
+ int bt = trans_i->u.backref;
+ regoff_t bt_len;
int result;
/* Get the substring we need to match against. Remember to
@@ -926,7 +928,7 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
endpoint values. */
static void
tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
- const tre_tnfa_t *tnfa, int *tags, int match_eo)
+ const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo)
{
tre_submatch_data_t *submatch_data;
unsigned int i, j;
@@ -996,7 +998,7 @@ regexec(const regex_t *restrict preg, const char *restrict string,
{
tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
reg_errcode_t status;
- int *tags = NULL, eo;
+ regoff_t *tags = NULL, eo;
if (tnfa->cflags & REG_NOSUB) nmatch = 0;
if (tnfa->num_tags > 0 && nmatch > 0)
{
--
cgit v0.11.2
|