diff --git a/configure.ac b/configure.ac index 4ad9307a..bdbf7701 100644 --- a/configure.ac +++ b/configure.ac @@ -455,49 +455,49 @@ have_regex_posix=0 need_regex_pcre=0 need_regex_posix=0 -AC_ARG_ENABLE(pcre, - [ --disable-pcre Disable PCRE support (default auto)], - [enable_pcre=$enableval], [enable_pcre="auto"]) +AC_ARG_ENABLE(pcre2, + [ --disable-pcre2 Disable PCRE2 support (default auto)], + [enable_pcre2=$enableval], [enable_pcre2="auto"]) -if test "x$enable_pcre" != "xno" ; then +if test "x$enable_pcre2" != "xno" ; then - AC_ARG_WITH(pcre-config, [ --with-pcre-config=PATH Location of PCRE pcre-config (auto)], pcre_config="$withval", pcre_config="") + AC_ARG_WITH(pcre2-config, [ --with-pcre2-config=PATH Location of PCRE2 pcre2-config (auto)], pcre2_config="$withval", pcre2_config="") - if test "X$pcre_config" != "Xno" ; then - if test "X$pcre_config" != "X" ; then - AC_MSG_CHECKING(for $pcre_config) + if test "X$pcre2_config" != "Xno" ; then + if test "X$pcre2_config" != "X" ; then + AC_MSG_CHECKING(for $pcre2_config) - if test -x $pcre_config ; then - PCRE_CONFIG=$pcre_config + if test -x $pcre2_config ; then + PCRE2_CONFIG=$pcre2_config AC_MSG_RESULT(yes) else - AC_MSG_ERROR([pcre-config not found at specified path $pcre_config]) + AC_MSG_ERROR([pcre2-config not found at specified path $pcre2_config]) fi fi - if test "X$PCRE_CONFIG" = "X" ; then - AC_CHECK_PROGS(PCRE_CONFIG, pcre-config) + if test "X$PCRE2_CONFIG" = "X" ; then + AC_CHECK_PROGS(PCRE2_CONFIG, pcre2-config) fi fi - AC_MSG_CHECKING(for pcre) - PCRE_VERSION=`$PCRE_CONFIG --version 2>/dev/null` - PCRE_MIN_VERSION=3.9 + AC_MSG_CHECKING(for pcre2) + PCRE2_VERSION=`$PCRE2_CONFIG --version 2>/dev/null` + PCRE2_MIN_VERSION=10.0 - PCRE_VERSION_DEC=`echo $PCRE_VERSION | $AWK -F. '{printf("%d\n", 100*$1 + $2)};'` - PCRE_MIN_VERSION_DEC=`echo $PCRE_MIN_VERSION | $AWK -F. '{printf("%d\n", 100*$1 + $2)};'` - if test "X$PCRE_VERSION" = X; then + PCRE2_VERSION_DEC=`echo $PCRE2_VERSION | $AWK -F. '{printf("%d\n", 100*$1 + $2)};'` + PCRE2_MIN_VERSION_DEC=`echo $PCRE2_MIN_VERSION | $AWK -F. '{printf("%d\n", 100*$1 + $2)};'` + if test "X$PCRE2_VERSION" = X; then AC_MSG_RESULT(not present) - elif test "X$PCRE_VERSION" -a $PCRE_VERSION_DEC -ge $PCRE_MIN_VERSION_DEC; then - have_regex_pcre=1 - AC_MSG_RESULT($PCRE_VERSION) + elif test "X$PCRE2_VERSION" -a $PCRE2_VERSION_DEC -ge $PCRE2_MIN_VERSION_DEC; then + have_regex_pcre2=1 + AC_MSG_RESULT($PCRE2_VERSION) else - AC_MSG_WARN($PCRE_VERSION - too old - need $PCRE_MIN_VERSION) + AC_MSG_WARN($PCRE2_VERSION - too old - need $PCRE2_MIN_VERSION) fi - if test $have_regex_pcre = 1; then - AC_DEFINE(HAVE_REGEX_PCRE, 1, [have PCRE regex - Perl Compatible Regular Expressions]) - elif test "x$enable_pcre" = "xyes"; then - AC_MSG_ERROR(PCRE requested but not found) + if test $have_regex_pcre2 = 1; then + AC_DEFINE(HAVE_REGEX_PCRE2, 1, [have PCRE2 regex - Perl Compatible Regular Expressions]) + elif test "x$enable_pcre2" = "xyes"; then + AC_MSG_ERROR(PCRE2 requested but not found) fi fi @@ -524,14 +524,14 @@ fi -AC_ARG_WITH(regex-library, [ --with-regex-library=NAME Use regex library - posix, pcre (auto)], regex_library="$withval", regex_library="") +AC_ARG_WITH(regex-library, [ --with-regex-library=NAME Use regex library - posix, pcre2 (auto)], regex_library="$withval", regex_library="") -for regex_library_name in $regex_library pcre posix; do +for regex_library_name in $regex_library pcre2 posix; do case $regex_library_name in - pcre) - if test $have_regex_pcre = 1; then - need_regex_pcre=1 - AC_DEFINE(RASQAL_REGEX_PCRE, 1, [Use PCRE regex library]) + pcre2) + if test $have_regex_pcre2 = 1; then + need_regex_pcre2=1 + AC_DEFINE(RASQAL_REGEX_PCRE2, 1, [Use PCRE2 regex library]) break fi ;; @@ -554,14 +554,14 @@ done AC_MSG_CHECKING(regex library to use) regex_library= -if test $need_regex_pcre = 1; then - regex_library=pcre +if test $need_regex_pcre2 = 1; then + regex_library=pcre2 elif test $need_regex_posix = 1; then regex_library=posix else regex_library=none AC_DEFINE(RASQAL_REGEX_NONE, 1, [Use no regex library]) - AC_MSG_WARN(No regex library available - please install pcre or a POSIX regex library) + AC_MSG_WARN(No regex library available - please install pcre2 or a POSIX regex library) fi AC_MSG_RESULT($regex_library) @@ -918,9 +918,9 @@ RASQAL_INTERNAL_CPPFLAGS="$RASQAL_INTERNAL_CPPFLAGS $RAPTOR2_CFLAGS" RASQAL_EXTERNAL_LIBS="$RASQAL_EXTERNAL_LIBS $RAPTOR2_LIBS" -if test $need_regex_pcre = 1; then - C=`$PCRE_CONFIG --cflags` - L=`$PCRE_CONFIG --libs` +if test $need_regex_pcre2 = 1; then + C=`$PCRE2_CONFIG --cflags` + L=`$PCRE2_CONFIG --libs8` RASQAL_INTERNAL_CPPFLAGS="$RASQAL_INTERNAL_CPPFLAGS $C" RASQAL_EXTERNAL_LIBS="$RASQAL_EXTERNAL_LIBS $L" diff --git a/src/rasqal_regex.c b/src/rasqal_regex.c index 698c80fd..979f89d8 100644 --- a/src/rasqal_regex.c +++ b/src/rasqal_regex.c @@ -37,8 +37,9 @@ #endif #include -#ifdef RASQAL_REGEX_PCRE -#include +#ifdef RASQAL_REGEX_PCRE2 +#define PCRE2_CODE_UNIT_WIDTH 8 +#include #endif #ifdef RASQAL_REGEX_POSIX @@ -81,12 +82,13 @@ rasqal_regex_match(rasqal_world* world, raptor_locator* locator, { int flag_i = 0; /* regex_flags contains i */ const char *p; -#ifdef RASQAL_REGEX_PCRE - pcre* re; - int compile_options = PCRE_UTF8; - int exec_options = 0; - const char *re_error = NULL; - int erroffset = 0; +#ifdef RASQAL_REGEX_PCRE2 + pcre2_code* re; + int compile_options = PCRE2_UTF; + int match_options = 0; + int re_error = 0; + PCRE2_SIZE erroffset = 0; + pcre2_match_data * md; #endif #ifdef RASQAL_REGEX_POSIX regex_t reg; @@ -99,35 +101,37 @@ rasqal_regex_match(rasqal_world* world, raptor_locator* locator, if(*p == 'i') flag_i++; -#ifdef RASQAL_REGEX_PCRE +#ifdef RASQAL_REGEX_PCRE2 if(flag_i) - compile_options |= PCRE_CASELESS; + compile_options |= PCRE2_CASELESS; - re = pcre_compile(RASQAL_GOOD_CAST(const char*, pattern), compile_options, - &re_error, &erroffset, NULL); + re = pcre2_compile(RASQAL_GOOD_CAST(const char*, pattern), PCRE2_ZERO_TERMINATED, + compile_options, &re_error, &erroffset, NULL); if(!re) { rasqal_log_error_simple(world, RAPTOR_LOG_LEVEL_ERROR, locator, - "Regex compile of '%s' failed - %s", pattern, re_error); + "Regex compile of '%s' failed - %d", pattern, re_error); rc = -1; } else { - rc = pcre_exec(re, - NULL, /* no study */ - subject, - RASQAL_BAD_CAST(int, subject_len), /* PCRE API is an int */ - 0 /* startoffset */, - exec_options /* options */, - NULL, 0 /* ovector, ovecsize - no matches wanted */ - ); + md = pcre2_match_data_create_from_pattern(re,NULL); + rc = pcre2_match(re, + subject, + subject_len, + 0 /* startoffset */, + match_options /* options */, + md, + NULL + ); if(rc >= 0) rc = 1; - else if(rc != PCRE_ERROR_NOMATCH) { + else if(rc != PCRE2_ERROR_NOMATCH) { rasqal_log_error_simple(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Regex match failed - returned code %d", rc); rc= -1; } else rc = 0; } - pcre_free(re); + pcre2_match_data_free(md); + pcre2_code_free(re); #endif @@ -206,17 +210,16 @@ rasqal_regex_get_ref_number(const char **str) } -#ifdef RASQAL_REGEX_PCRE +#ifdef RASQAL_REGEX_PCRE2 static char* rasqal_regex_replace_pcre(rasqal_world* world, raptor_locator* locator, - pcre* re, int options, + pcre2_code* re, int options, const char *subject, size_t subject_len, const char *replace, size_t replace_len, size_t *result_len_p) { - int capture_count; - int *ovector = NULL; - int ovecsize; + pcre2_match_data* md; + PCRE2_SIZE *ovector = NULL; size_t startoffset; int matched_empty_options; char *result = NULL; @@ -224,14 +227,6 @@ rasqal_regex_replace_pcre(rasqal_world* world, raptor_locator* locator, size_t result_len; /* used size of result */ const char *replace_end = replace + replace_len; - if(pcre_fullinfo(re, NULL, PCRE_INFO_CAPTURECOUNT, &capture_count) < 0) - goto failed; - - ovecsize = (capture_count + 1) * 3; /* +1 for whole pattern match pair */ - ovector = RASQAL_CALLOC(int *, RASQAL_GOOD_CAST(size_t, ovecsize), sizeof(int)); - if(!ovector) - goto failed; - result_size = subject_len << 1; result = RASQAL_MALLOC(char*, result_size + 1); if(!result) @@ -245,27 +240,21 @@ rasqal_regex_replace_pcre(rasqal_world* world, raptor_locator* locator, int stringcount; const char *subject_piece = subject + startoffset; - stringcount = pcre_exec(re, - NULL, /* no study */ - subject, - RASQAL_BAD_CAST(int, subject_len), /* PCRE API is an int */ - RASQAL_BAD_CAST(int, startoffset), - options | matched_empty_options, - ovector, ovecsize); + md = pcre2_match_data_create_from_pattern(re,NULL); + stringcount = pcre2_match(re, + subject, + subject_len, + startoffset, + options | matched_empty_options, + md, NULL); - /* "The value returned by pcre_exec() is one more than the + /* "The value returned by pcre2_match() is one more than the * highest numbered pair that has been set. ... If there are no * capturing subpatterns, the return value from a successful * match is 1, indicating that just the first pair of offsets has - * been set." - pcreapi + * been set." - pcre2api */ - if(!stringcount) - /* ovector was too small - how can this happen?. Use all - * the variables available. Should return an warning? FIXME - */ - stringcount = ovecsize / 3; - if(stringcount > 0) { /* matches have been found */ @@ -275,6 +264,7 @@ rasqal_regex_replace_pcre(rasqal_world* world, raptor_locator* locator, const char *replace_p; char last_char; char *result_p; + ovector = pcre2_get_ovector_pointer(md); subject_match = subject + ovector[0]; @@ -376,18 +366,18 @@ rasqal_regex_replace_pcre(rasqal_world* world, raptor_locator* locator, /* * "It is possible to emulate Perl's behaviour after matching a * null string by first trying the match again at the same - * offset with PCRE_NOTEMPTY and PCRE_ANCHORED, and then if + * offset with PCRE2_NOTEMPTY and PCRE2_ANCHORED, and then if * that fails by advancing the starting offset ... and trying * an ordinary match again." - pcreapi * * The 'and then if' part is implemented by the if() inside - * the if(stringcount == PCRE_ERROR_NOMATCH) below. + * the if(stringcount == PCRE2_ERROR_NOMATCH) below. * */ matched_empty_options = (ovector[0] == ovector[1]) ? - (PCRE_NOTEMPTY | PCRE_ANCHORED) : 0; + (PCRE2_NOTEMPTY | PCRE2_ANCHORED) : 0; - } else if(stringcount == PCRE_ERROR_NOMATCH) { + } else if(stringcount == PCRE2_ERROR_NOMATCH) { /* No match */ size_t piece_len; size_t new_result_len; @@ -428,13 +418,12 @@ rasqal_regex_replace_pcre(rasqal_world* world, raptor_locator* locator, break; } else { /* stringcount < 0 : other failures */ - RASQAL_DEBUG2("pcre_exec() failed with code %d\n", stringcount); + RASQAL_DEBUG2("pcre2_match() failed with code %d\n", stringcount); goto failed; } + pcre2_match_data_free(md); } - RASQAL_FREE(int*, ovector); - if(result_len_p) *result_len_p = result_len; @@ -698,12 +687,12 @@ rasqal_regex_replace(rasqal_world* world, raptor_locator* locator, size_t* result_len_p) { const char *p; -#ifdef RASQAL_REGEX_PCRE - pcre* re; - int compile_options = PCRE_UTF8; +#ifdef RASQAL_REGEX_PCRE2 + pcre2_code* re; + int compile_options = PCRE2_UTF; int exec_options = 0; - const char *re_error = NULL; - int erroffset = 0; + int re_error = 0; + PCRE2_SIZE erroffset = 0; #endif #ifdef RASQAL_REGEX_POSIX regex_t reg; @@ -715,24 +704,24 @@ rasqal_regex_replace(rasqal_world* world, raptor_locator* locator, #endif char *result_s = NULL; -#ifdef RASQAL_REGEX_PCRE +#ifdef RASQAL_REGEX_PCRE2 for(p = regex_flags; p && *p; p++) { if(*p == 'i') - exec_options |= PCRE_CASELESS; + exec_options |= PCRE2_CASELESS; } - re = pcre_compile(pattern, compile_options, + re = pcre2_compile(pattern, PCRE2_ZERO_TERMINATED, compile_options, &re_error, &erroffset, NULL); if(!re) { rasqal_log_error_simple(world, RAPTOR_LOG_LEVEL_ERROR, locator, - "Regex compile of '%s' failed - %s", pattern, re_error); + "Regex compile of '%s' failed - %d", pattern, re_error); } else result_s = rasqal_regex_replace_pcre(world, locator, re, exec_options, subject, subject_len, replace, replace_len, result_len_p); - pcre_free(re); + pcre2_code_free(re); #endif #ifdef RASQAL_REGEX_POSIX @@ -794,7 +783,7 @@ main(int argc, char *argv[]) { rasqal_world* world; const char *program = rasqal_basename(argv[0]); -#ifdef RASQAL_REGEX_PCRE +#ifdef RASQAL_REGEX_PCRE2 raptor_locator* locator = NULL; int test = 0; #endif @@ -813,7 +802,7 @@ main(int argc, char *argv[]) program); #endif -#ifdef RASQAL_REGEX_PCRE +#ifdef RASQAL_REGEX_PCRE2 for(test = 0; test < NTESTS; test++) { const char* regex_flags = ""; const char* subject = "abcd1234-^";