| 1 | /* |
| 2 | tre-internal.h - TRE internal definitions |
| 3 | |
| 4 | This software is released under a BSD-style license. |
| 5 | See the file LICENSE for details and copyright. |
| 6 | |
| 7 | */ |
| 8 | |
| 9 | #ifndef TRE_INTERNAL_H |
| 10 | #define TRE_INTERNAL_H 1 |
| 11 | |
| 12 | #ifdef HAVE_WCHAR_H |
| 13 | #include <wchar.h> |
| 14 | #endif /* HAVE_WCHAR_H */ |
| 15 | |
| 16 | #ifdef HAVE_WCTYPE_H |
| 17 | #include <wctype.h> |
| 18 | #endif /* !HAVE_WCTYPE_H */ |
| 19 | |
| 20 | #include <ctype.h> |
| 21 | #include <bitstring.h> |
| 22 | #include "tre.h" |
| 23 | #include "xlocale_private.h" |
| 24 | |
| 25 | #ifdef TRE_DEBUG |
| 26 | #include <stdio.h> |
| 27 | #define DPRINT(msg) do {printf msg; fflush(stdout);} while(/*CONSTCOND*/0) |
| 28 | #else /* !TRE_DEBUG */ |
| 29 | #define DPRINT(msg) do { } while(/*CONSTCOND*/0) |
| 30 | #endif /* !TRE_DEBUG */ |
| 31 | |
| 32 | #define elementsof(x) ( sizeof(x) / sizeof(x[0]) ) |
| 33 | |
| 34 | #ifdef HAVE_MBRTOWC |
| 35 | #define tre_mbrtowc(pwc, s, n, ps) (mbrtowc((pwc), (s), (n), (ps))) |
| 36 | #define tre_mbrtowc_l(pwc,s,n,ps,l) (mbrtowc_l((pwc), (s), (n), (ps), (l))) |
| 37 | #else /* !HAVE_MBRTOWC */ |
| 38 | #ifdef HAVE_MBTOWC |
| 39 | #define tre_mbrtowc(pwc, s, n, ps) (mbtowc((pwc), (s), (n))) |
| 40 | #endif /* HAVE_MBTOWC */ |
| 41 | #endif /* !HAVE_MBRTOWC */ |
| 42 | |
| 43 | #ifdef TRE_MULTIBYTE |
| 44 | #ifdef HAVE_MBSTATE_T |
| 45 | #define TRE_MBSTATE |
| 46 | #endif /* TRE_MULTIBYTE */ |
| 47 | #endif /* HAVE_MBSTATE_T */ |
| 48 | |
| 49 | /* Define the character types and functions. */ |
| 50 | #ifdef TRE_WCHAR |
| 51 | |
| 52 | /* Wide characters. */ |
| 53 | typedef wint_t tre_cint_t; |
| 54 | #define TRE_CHAR_MAX WCHAR_MAX |
| 55 | |
| 56 | #ifdef TRE_MULTIBYTE |
| 57 | #define TRE_MB_CUR_MAX MB_CUR_MAX |
| 58 | #define TRE_MB_CUR_MAX_L MB_CUR_MAX_L |
| 59 | #else /* !TRE_MULTIBYTE */ |
| 60 | #define TRE_MB_CUR_MAX 1 |
| 61 | #endif /* !TRE_MULTIBYTE */ |
| 62 | |
| 63 | #define tre_isalnum iswalnum |
| 64 | #define tre_isalpha iswalpha |
| 65 | #ifdef HAVE_ISWBLANK |
| 66 | #define tre_isblank iswblank |
| 67 | #endif /* HAVE_ISWBLANK */ |
| 68 | #define tre_iscntrl iswcntrl |
| 69 | #define tre_isdigit iswdigit |
| 70 | #define tre_isgraph iswgraph |
| 71 | #define tre_islower iswlower |
| 72 | #define tre_isprint iswprint |
| 73 | #define tre_ispunct iswpunct |
| 74 | #define tre_isspace iswspace |
| 75 | #define tre_isupper iswupper |
| 76 | #define tre_isxdigit iswxdigit |
| 77 | |
| 78 | #define tre_tolower towlower |
| 79 | #define tre_toupper towupper |
| 80 | #define tre_strlen wcslen |
| 81 | |
| 82 | #define tre_isalnum_l iswalnum_l |
| 83 | #define tre_isdigit_l iswdigit_l |
| 84 | #define tre_islower_l iswlower_l |
| 85 | #define tre_isupper_l iswupper_l |
| 86 | #define tre_isxdigit_l iswxdigit_l |
| 87 | #define tre_tolower_l towlower_l |
| 88 | #define tre_toupper_l towupper_l |
| 89 | |
| 90 | #else /* !TRE_WCHAR */ |
| 91 | |
| 92 | /* 8 bit characters. */ |
| 93 | typedef short tre_cint_t; |
| 94 | #define TRE_CHAR_MAX 255 |
| 95 | #define TRE_MB_CUR_MAX 1 |
| 96 | |
| 97 | #define tre_isalnum isalnum |
| 98 | #define tre_isalpha isalpha |
| 99 | #ifdef HAVE_ISASCII |
| 100 | #define tre_isascii isascii |
| 101 | #endif /* HAVE_ISASCII */ |
| 102 | #ifdef HAVE_ISBLANK |
| 103 | #define tre_isblank isblank |
| 104 | #endif /* HAVE_ISBLANK */ |
| 105 | #define tre_iscntrl iscntrl |
| 106 | #define tre_isdigit isdigit |
| 107 | #define tre_isgraph isgraph |
| 108 | #define tre_islower islower |
| 109 | #define tre_isprint isprint |
| 110 | #define tre_ispunct ispunct |
| 111 | #define tre_isspace isspace |
| 112 | #define tre_isupper isupper |
| 113 | #define tre_isxdigit isxdigit |
| 114 | |
| 115 | #define tre_tolower(c) (tre_cint_t)(tolower(c)) |
| 116 | #define tre_toupper(c) (tre_cint_t)(toupper(c)) |
| 117 | #define tre_strlen(s) (strlen((const char*)s)) |
| 118 | |
| 119 | #endif /* !TRE_WCHAR */ |
| 120 | |
| 121 | #if defined(TRE_WCHAR) && defined(HAVE_ISWCTYPE) && defined(HAVE_WCTYPE) |
| 122 | #define TRE_USE_SYSTEM_WCTYPE 1 |
| 123 | #endif |
| 124 | |
| 125 | #ifdef TRE_USE_SYSTEM_WCTYPE |
| 126 | /* Use system provided iswctype() and wctype(). */ |
| 127 | typedef wctype_t tre_ctype_t; |
| 128 | #define tre_isctype iswctype |
| 129 | #define tre_ctype wctype |
| 130 | #define tre_isctype_l iswctype_l |
| 131 | #define tre_ctype_l wctype_l |
| 132 | #else /* !TRE_USE_SYSTEM_WCTYPE */ |
| 133 | /* Define our own versions of iswctype() and wctype(). */ |
| 134 | typedef int (*tre_ctype_t)(tre_cint_t); |
| 135 | #define tre_isctype(c, type) ( (type)(c) ) |
| 136 | tre_ctype_t tre_ctype(const char *name); |
| 137 | #endif /* !TRE_USE_SYSTEM_WCTYPE */ |
| 138 | |
| 139 | typedef enum { STR_WIDE, STR_BYTE, STR_MBS } tre_str_type_t; |
| 140 | |
| 141 | /* Returns number of bytes to add to (char *)ptr to make it |
| 142 | properly aligned for the type. */ |
| 143 | #define ALIGN(ptr, type) \ |
| 144 | ((((long)ptr) % sizeof(type)) \ |
| 145 | ? (sizeof(type) - (((long)ptr) % sizeof(type))) \ |
| 146 | : 0) |
| 147 | |
| 148 | #undef MAX |
| 149 | #undef MIN |
| 150 | #define MAX(a, b) (((a) >= (b)) ? (a) : (b)) |
| 151 | #define MIN(a, b) (((a) <= (b)) ? (a) : (b)) |
| 152 | |
| 153 | /* Define STRF to the correct printf formatter for strings. */ |
| 154 | #ifdef TRE_WCHAR |
| 155 | #define STRF "ls" |
| 156 | #else /* !TRE_WCHAR */ |
| 157 | #define STRF "s" |
| 158 | #endif /* !TRE_WCHAR */ |
| 159 | |
| 160 | /* Types to handle bracket expressions. */ |
| 161 | typedef enum { |
| 162 | TRE_BRACKET_MATCH_TYPE_UNUSED = 0, |
| 163 | TRE_BRACKET_MATCH_TYPE_CHAR, /* Single character value */ |
| 164 | TRE_BRACKET_MATCH_TYPE_RANGE_BEGIN, /* Collation range begin */ |
| 165 | TRE_BRACKET_MATCH_TYPE_RANGE_END, /* Collation range end */ |
| 166 | TRE_BRACKET_MATCH_TYPE_CLASS, /* Character class */ |
| 167 | TRE_BRACKET_MATCH_TYPE_EQUIVALENCE, /* Collation equivalence value */ |
| 168 | } tre_bracket_match_type_t; |
| 169 | |
| 170 | typedef struct { |
| 171 | tre_bracket_match_type_t type; |
| 172 | tre_cint_t value; |
| 173 | } tre_bracket_match_t; |
| 174 | |
| 175 | #define TRE_BRACKET_MATCH_FLAG_NEGATE 1 |
| 176 | |
| 177 | typedef struct { |
| 178 | int num_bracket_matches; |
| 179 | int flags; |
| 180 | tre_bracket_match_t bracket_matches[0]; |
| 181 | } tre_bracket_match_list_t; |
| 182 | |
| 183 | #define SIZEOF_BRACKET_MATCH_LIST_N(n) (sizeof(tre_bracket_match_list_t) + \ |
| 184 | sizeof(tre_bracket_match_t) * (n)) |
| 185 | #define SIZEOF_BRACKET_MATCH_LIST(l) SIZEOF_BRACKET_MATCH_LIST_N( \ |
| 186 | (l)->num_bracket_matches) |
| 187 | |
| 188 | /* The "count" field is the number of time the tag was set, initially zero. |
| 189 | The "first" field contains the first set value (when "count" equals 1). |
| 190 | The "value" field contains the current value of the tag, if "count" is |
| 191 | greater than zero (the tag's current value is -1 if "count" is zero). |
| 192 | The "touch" field is the touch value, a montonically increasing value |
| 193 | (maintained by the caller) set each time the tag itself is set. */ |
| 194 | typedef struct { |
| 195 | int count; |
| 196 | int first; |
| 197 | int value; |
| 198 | int touch; |
| 199 | } tre_tag_t; |
| 200 | |
| 201 | /* TNFA transition type. A TNFA state is an array of transitions, |
| 202 | the terminator is a transition with NULL `state'. */ |
| 203 | typedef struct tnfa_transition tre_tnfa_transition_t; |
| 204 | |
| 205 | struct tnfa_transition { |
| 206 | /* Range of accepted characters. */ |
| 207 | tre_cint_t code_min; |
| 208 | tre_cint_t code_max; |
| 209 | /* Pointer to the destination state. */ |
| 210 | tre_tnfa_transition_t *state; |
| 211 | /* ID number of the destination state. */ |
| 212 | int state_id; |
| 213 | /* -1 terminated array of tags (or NULL). */ |
| 214 | int *tags; |
| 215 | /* Matching parameters settings (or NULL). */ |
| 216 | int *params; |
| 217 | /* Assertion bitmap. */ |
| 218 | int assertions; |
| 219 | /* Assertion parameters. */ |
| 220 | union { |
| 221 | /* Bracket matches. */ |
| 222 | tre_bracket_match_list_t *bracket_match_list; |
| 223 | /* Back reference assertion. */ |
| 224 | int backref; |
| 225 | } u; |
| 226 | }; |
| 227 | |
| 228 | |
| 229 | /* Assertions. */ |
| 230 | #define ASSERT_AT_BOL 1 /* Beginning of line. */ |
| 231 | #define ASSERT_AT_EOL 2 /* End of line. */ |
| 232 | #define ASSERT_BRACKET_MATCH 4 /* Matches in `bracket_match_list'. */ |
| 233 | #define ASSERT_AT_BOW 8 /* Beginning of word. */ |
| 234 | #define ASSERT_AT_EOW 16 /* End of word. */ |
| 235 | #define ASSERT_AT_WB 32 /* Word boundary. */ |
| 236 | #define ASSERT_AT_WB_NEG 64 /* Not a word boundary. */ |
| 237 | #define ASSERT_BACKREF 128 /* A back reference in `backref'. */ |
| 238 | #define ASSERT_LAST 128 |
| 239 | |
| 240 | /* Tag directions. */ |
| 241 | typedef enum { |
| 242 | TRE_TAG_MINIMIZE = 0, |
| 243 | TRE_TAG_MAXIMIZE, |
| 244 | TRE_TAG_LEFT_MAXIMIZE, |
| 245 | } tre_tag_direction_t; |
| 246 | |
| 247 | /* Parameters that can be changed dynamically while matching. */ |
| 248 | typedef enum { |
| 249 | TRE_PARAM_COST_INS = 0, |
| 250 | TRE_PARAM_COST_DEL = 1, |
| 251 | TRE_PARAM_COST_SUBST = 2, |
| 252 | TRE_PARAM_COST_MAX = 3, |
| 253 | TRE_PARAM_MAX_INS = 4, |
| 254 | TRE_PARAM_MAX_DEL = 5, |
| 255 | TRE_PARAM_MAX_SUBST = 6, |
| 256 | TRE_PARAM_MAX_ERR = 7, |
| 257 | TRE_PARAM_DEPTH = 8, |
| 258 | TRE_PARAM_LAST = 9 |
| 259 | } tre_param_t; |
| 260 | |
| 261 | /* Unset matching parameter */ |
| 262 | #define TRE_PARAM_UNSET -1 |
| 263 | |
| 264 | /* Signifies the default matching parameter value. */ |
| 265 | #define TRE_PARAM_DEFAULT -2 |
| 266 | |
| 267 | /* Instructions to compute submatch register values from tag values |
| 268 | after a successful match. */ |
| 269 | struct tre_submatch_data { |
| 270 | /* Tag that gives the value for rm_so (submatch start offset). */ |
| 271 | int so_tag; |
| 272 | /* Tag that gives the value for rm_eo (submatch end offset). */ |
| 273 | int eo_tag; |
| 274 | }; |
| 275 | |
| 276 | typedef struct tre_submatch_data tre_submatch_data_t; |
| 277 | |
| 278 | |
| 279 | /* LAST MATCHED structures */ |
| 280 | struct __previous_type; |
| 281 | struct __previous_branch_type; |
| 282 | struct __current_type; |
| 283 | struct __current_branch_type; |
| 284 | |
| 285 | typedef struct __previous_branch_type { |
| 286 | struct __previous_branch_type *next; |
| 287 | struct __previous_type *last_matched; int n_last_matched; int cmp_tag; int n_tags; |
| 288 | int tot_branches; |
| 289 | int tot_last_matched; |
| 290 | int tot_tags; |
| 291 | bitstr_t tags[0]; |
| 292 | } tre_last_matched_branch_pre_t; |
| 293 | |
| 294 | typedef struct __previous_type { |
| 295 | struct __previous_type *next; |
| 296 | tre_last_matched_branch_pre_t *branches; int n_branches; int start_tag; |
| 297 | int tot_branches; |
| 298 | int tot_last_matched; |
| 299 | int tot_tags; |
| 300 | } tre_last_matched_pre_t; |
| 301 | |
| 302 | typedef struct __current_branch_type { |
| 303 | int *tags; |
| 304 | struct __current_type *last_matched; int n_last_matched; int cmp_tag; int n_tags; |
| 305 | } tre_last_matched_branch_t; |
| 306 | |
| 307 | typedef struct __current_type { |
| 308 | tre_last_matched_branch_t *branches; int n_branches; int start_tag; |
| 309 | } tre_last_matched_t; |
| 310 | |
| 311 | /* TNFA definition. */ |
| 312 | typedef struct tnfa tre_tnfa_t; |
| 313 | |
| 314 | struct tnfa { |
| 315 | tre_tnfa_transition_t *transitions; |
| 316 | tre_tnfa_transition_t *initial; |
| 317 | tre_tnfa_transition_t *final; |
| 318 | tre_submatch_data_t *submatch_data; |
| 319 | tre_tag_direction_t *tag_directions; |
| 320 | int *minimal_tags; |
| 321 | tre_last_matched_branch_t *last_matched_branch; |
| 322 | locale_t loc; |
| 323 | unsigned int num_transitions; |
| 324 | int first_char; |
| 325 | unsigned int num_submatches; |
| 326 | unsigned int num_submatches_invisible; |
| 327 | int num_tags; |
| 328 | int num_minimals; |
| 329 | int end_tag; |
| 330 | int num_states; |
| 331 | int cflags; |
| 332 | int have_backrefs; |
| 333 | int num_reorder_tags; |
| 334 | int have_approx; |
| 335 | int params_depth; |
| 336 | }; |
| 337 | |
| 338 | int |
| 339 | tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags, |
| 340 | locale_t loc); |
| 341 | |
| 342 | void |
| 343 | tre_free(regex_t *preg); |
| 344 | |
| 345 | reg_errcode_t |
| 346 | tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags, |
| 347 | const tre_tnfa_t *tnfa, const tre_tag_t *tags, int match_eo); |
| 348 | |
| 349 | reg_errcode_t |
| 350 | tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, |
| 351 | tre_str_type_t type, tre_tag_t *match_tags, int eflags, |
| 352 | int *match_end_ofs); |
| 353 | |
| 354 | reg_errcode_t |
| 355 | tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, |
| 356 | int len, tre_str_type_t type, tre_tag_t *match_tags, |
| 357 | int eflags, int *match_end_ofs); |
| 358 | |
| 359 | #ifdef TRE_APPROX |
| 360 | reg_errcode_t |
| 361 | tre_tnfa_run_approx(const tre_tnfa_t *tnfa, const void *string, int len, |
| 362 | tre_str_type_t type, tre_tag_t *match_tags, |
| 363 | regamatch_t *match, regaparams_t params, |
| 364 | int eflags, int *match_end_ofs); |
| 365 | #endif /* TRE_APPROX */ |
| 366 | |
| 367 | #endif /* TRE_INTERNAL_H */ |
| 368 | |
| 369 | /* EOF */ |