Perl · khwilliamson · Aug 17, 2025 · Sep 5, 2025 · Sep 5, 2025 · Oct 27, 2025
diff --git a/toke.c b/toke.c
@@ -4590,10 +4590,7 @@ S_intuit_more(pTHX_ char *s, char *e,
      * written, and regcurly never required a comma, as in {0}.  Probably it is
      * ok as-is */
     if (s[0] == '{') {
-        if (regcurly(s, e, NULL)) {
-            return FALSE;
-        }
-        return TRUE;
+        return ! regcurly(s, e, NULL);
     }
 
     /* Here is '[': maybe we have a character class.  Examine the guts */
@@ -4735,72 +4732,147 @@ S_intuit_more(pTHX_ char *s, char *e,
              * looks for.
              *
              */
-            if (isWORDCHAR_lazy_if_safe(s+1, PL_bufend, UTF)) {
-                Size_t len;
 
-                /* khw: where did the magic number 4 come from?.  This buffer
-                 * was 4 times as large as tokenbuf in 1997, and had not
-                 * changed since the code was first added */
-                char tmpbuf[ C_ARRAY_LENGTH(PL_tokenbuf) * 4 ];
+            /* (Reserve tmpbuf[0] for future commits, hence +1) */
+            char tmpbuf[ C_ARRAY_LENGTH(PL_tokenbuf) + 1 ];
+            char * s_after_ident;
+
+            /* scan_ident returns NULL if the input looks like an identifier
+             * that is illegal, e.g., it is too long or is like $001. */
+            s_after_ident = scan_ident(s, tmpbuf + 1, C_ARRAY_END(tmpbuf),
+                                       CHECK_ONLY);
+            if (s_after_ident == NULL) {
+
+                /* An illegal identifier means this can't be a subscript;
+                 * it's an error or it could be a charclass */
+                return false;
+            }
+
+            Size_t len; /* (C++ forbids joining these 2 lines) */
+            len = strlen(tmpbuf + 1);
+
+            /* If it doesn't look like an identifier at all, scan_ident will
+             * set tmpbuf[1] to NUL.  This is either an error or a character
+             * class. */
+            if (len == 0) {
+                return false;
+            }
+
+            /* If there is extra stuff in the source, like braces, it means
+             * this is almost definitely intended to be an identifier */
+            bool decorated;
+            decorated = (Size_t) (s_after_ident - s) > len;
+
+            if (isDIGIT_A(tmpbuf[1])) {
+
+                /* &41 and &4b are illegal subroutine names so is an error or
+                 * a charclass */
+                if (s[0] == '&') {
+                    return false;
+                }
 
-                if (! scan_ident(s, tmpbuf, C_ARRAY_END(tmpbuf), CHECK_ONLY))
+                /* Here, matches [$@]\d+.  If the next input character is a
+                 * \w, we would have something like $456x, which is an illegal
+                 * identifer, so is an error or a charclass */
+                if ( ! decorated
+                    && isWORDCHAR_lazy_if_safe(s_after_ident,
+                                               PL_bufend, UTF))
                 {
-                    /* An illegal identifier means this can't be a subscript;
-                     * it's an error or it could be a charclass */
                     return false;
                 }
 
-                len = strlen(tmpbuf);
+                /* We don't get here if this potential identifier starts with
+                 * leading zeros, due to the logic in scan_ident. */
+                assert(len == 1 || tmpbuf[0] != '0');
 
-                /* khw: This only looks at global variables; lexicals came
-                 * later, and this hasn't been updated.  Ouch!! */
-                if (   len > 1
-                    && gv_fetchpvn_flags(tmpbuf,
-                                         len,
-                                         UTF ? SVf_UTF8 : 0,
-                                         SVt_PV))
+                /* The chances are vanishingly small that someone is going to
+                 * want [$0] to expand to the program's name in a character
+                 * class.  But, what would the program's name be doing as part
+                 * of a subscript either?  The only likely scenario is that
+                 * this is meant to be a charclass matching either '$' or '0'.
+                 * */
+                if (tmpbuf[1] == '0') {
+                    return false;
+                }
+
+                /* Here it is either something like $1 which is supposed to
+                 * match either dollar or 1, or it is supposed to expand to
+                 * what is in $1 left over from a capturing group from the
+                 * previous pattern match.  In the latter case, it could be
+                 * either a part of wanting to calculate a subscript, or to
+                 * use as the contents of as part of the character class.
+                 * Larger (undecorated) numbers are much less likely to have
+                 * had capturing groups, so they lean more towards a
+                 * charclass.  100 is what this function has traditionally
+                 * used for len>1; khw thinks there is no bias one way or the
+                 * other for length 1 ones.  But has chosen 100 for decorated
+                 * identifiers
+                 *
+                 * XXX long enough identifiers could probably return false
+                 * immediately here, rather than using weights. */
+                if (decorated || len > 1) {
+                        weight -= 100;
+                }
+            }
+            else if (len > 1) {
+                /* See if there is a known identifier of the given kind.  For
+                 * arrays, this might also be a reference to one of its
+                 * elements.   XXX Maybe the latter should require a following
+                 * '[' */
+                if (   is_existing_identifier(tmpbuf, len, s[0], UTF)
+                    || (   s[0] == '$'
+                        && is_existing_identifier(tmpbuf, len, '@', UTF)))
                 {
                     weight -= 100;
 
-                    /* khw: Below we keep track of repeated characters;  People
-                     * rarely say qr/[aba]/, as the second a is pointless.
-                     * (Some do it though as a mnemonic that is meaningful to
-                     * them.)  But generally, repeated characters make things
-                     * more likely to be a charclass.  But here, this an
-                     * identifier so likely a subscript.  Its spelling should
-                     * be irrelevant to the repeated characters test.  So, we
-                     * should advance past it.  Suppose it is a hash element,
-                     * like $subscripts{$which}.  We should advance past the
-                     * braces and key */
+                    /* khw: Below we keep track of repeated characters;
+                     * People rarely say qr/[aba]/, as the second a is
+                     * pointless.  (Some do it though as a mnemonic that is
+                     * meaningful to them.) But generally, repeated characters
+                     * make things more likely to be a charclass.  But here,
+                     * this an identifier so likely a subscript.  Its spelling
+                     * should be irrelevant to the repeated characters test.
+                     * So, we should advance past it.  Suppose it is a hash
+                     * element, like $subscripts{$which}.  We should advance
+                     * past the braces and key */
                 }
-                else {
-                    /* Not a multi-char identifier already known in the
-                     * program; is somewhat likely to be a subscript.
-                     *
-                     * khw: Our test suite contains several constructs like
-                     * [$A-Z].  Excluding length 1 identifiers in the
-                     * conditional above means such are much less likely to be
-                     * mistaken for subscripts.  I would argue that if the next
-                     * character is a '-' followed by an alpha, that would make
-                     * it much more likely to be a charclass.  It would only
-                     * make sense to be an expression if that alpha string is a
-                     * bareword with meaning; something like [$A-ord] */
+                else {  /* Isn't a known identifier */
+                    /* Under strict, this means an error. */
+                    if (under_strict_vars) {
+                        return false;
+                    }
+
+                    /* Otherwise still somewhat likely to be a subscript */
                     weight -= 10;
                 }
             }
-            else if (   s[0] == '$'
-                     && s[1]
-                     && memCHRs("[#!%*<>()-=", s[1]))
+            else if (   len == 1
+                     && s[0] == '$'
+                     && memCHRs("[#!%*<>()-=", tmpbuf[1]))
             {
                 /* Here we have what could be a punctuation variable.  If the
                  * next character after it is a closing bracket, it makes it
                  * quite likely to be that, and hence a subscript.  If it is
                  * something else, more mildly a subscript */
-                if (/*{*/ memCHRs("])} =", s[2]))
+                if (/*{*/ memCHRs("])} =", tmpbuf[2]))
                     weight -= 10;
                 else
                     weight -= 1;
             }
+            else {
+                /* Not a multi-char identifier already known in the program;
+                 * is somewhat likely to be a subscript.
+                 *
+                 * khw: Our test suite contains several constructs like
+                 * [$A-Z].  Excluding length 1 identifiers in the conditional
+                 * above means such are much less likely to be mistaken for
+                 * subscripts.  I would argue that if the next character is a
+                 * '-' followed by an alpha, that would make it much more
+                 * likely to be a charclass.  It would only make sense to be
+                 * an expression if that alpha string is a bareword with
+                 * meaning; something like [$A-ord] */
+                weight -= 10;
+            }
             break;
 
           /* khw:  [:blank:] strongly indicates a charclass */
@@ -4812,66 +4884,64 @@ S_intuit_more(pTHX_ char *s, char *e,
            *      \? must be subscript for things like \d, but not \a.
            */
 
-
           case '\\':
-            if (s[1]) {
-                if (memCHRs("wds]", s[1])) {
-                    weight += 100;  /* \w \d \s => strongly charclass */
-                    /* khw: \] can't happen, as any ']' is beyond our search.
-                     * Why not \W \D \S \h \v, etc as well?  Should they have
-                     * the same weights as \w \d \s or should all or some be
-                     * in the 'abcfnrtvx' below? */
-                } else if (seen[(U8)'\''] || seen[(U8)'"']) {
-                    weight += 1;
-                    /* khw: This is problematic.  Enough so, that I misread
-                     * it, and added a wrong comment about what it does in
-                     * 57ae1f3a8e669082e3d5ec6a8cdffbdc39d87bee.  Note that it
-                     * doesn't look at the current character.  What it
-                     * actually does is: if any quote has been seen in the
-                     * parse, don't do the rest of the else's below, but for
-                     * every subsequent backslashed character encountered
-                     * (except \0 \w \s \d), increment the weight to lean a
-                     * bit more towards being a charclass.  That means that
-                     * every backslash sequence following the first occurrence
-                     * of a quote increments the weight regardless of what the
-                     * sequence is.  Again, \0 \w \d and \s are not controlled
-                     * by this else, so they change the weight by a lot more.
-                     * But what makes them so special that they aren't subject
-                     * to this.  Any why does having a quote change the
-                     * behavior from then on.  And why only backslashed
-                     * sequences get this treatment?  This code has been
-                     * unchanged since this function was added in 1993.  I
-                     * don't get it.  Instead, it does seem to me that it is
-                     * especially unlikely to repeat a quote in a charclass,
-                     * but that having just a single quote is indicative of a
-                     * charclass, and having pairs of quotes is indicative of
-                     * a subscript.  Similarly for things that could indicate
-                     * nesting of braces or parens. */
-                }
-                else if (memCHRs("abcfnrtvx", s[1]))
-                    weight += 40;   /* \n, etc => charclass */
-                    /* khw: Why not \e etc as well? */
-                else if (isDIGIT(s[1])) {
-                    weight += 40;   /* \123 => charclass */
-                    while (s[1] && isDIGIT(s[1]))
-                        s++;
-                }
-
-                /* khw: There are lots more possible escape sequences.  Some,
-                 * like \A,\z have no special meaning to charclasses, so might
-                 * indicate a subscript, but I don't know what they would be
-                 * doing there either.  Some have been added to the language
-                 * after this code was written, but no one thought to, or
-                 * could wade through this function, to add them.  Things like
-                 * \p{} for properties, \N and \N{}, for example.
-                 *
-                 * It's problematic that \a is treated as plain 'a' for
-                 * purposes of the 'seen' array.  Whatever is matched by these
-                 * backslashed sequences should not be added to 'seen'.  That
-                 * includes the backslash. */
-            }
-            else /* \ followed by NUL strongly indicates character class */
+            if (s[1] == '\0') {
+                /* \ followed by NUL strongly indicates character class */
                 weight += 100;
+            }
+            else if (memCHRs("wds]", s[1])) {
+                weight += 100;  /* \w \d \s => strongly charclass */
+                /* khw: \] can't happen, as any ']' is beyond our search.  Why
+                 * not \W \D \S \h \v, etc as well?  Should they have the same
+                 * weights as \w \d \s or should all or some be in the
+                 * 'abcfnrtvx' below? */
+            }
+            else if (seen[(U8)'\''] || seen[(U8)'"']) {
+                weight += 1;
+                /* khw: This is problematic.  Enough so, that I misread it,
+                 * and added a wrong comment about what it does in
+                 * 57ae1f3a8e669082e3d5ec6a8cdffbdc39d87bee.  Note that it
+                 * doesn't look at the current character.  What it actually
+                 * does is: if any quote has been seen in the parse, don't do
+                 * the rest of the else's below, but for every subsequent
+                 * backslashed character encountered (except \0 \w \s \d),
+                 * increment the weight to lean a bit more towards being a
+                 * charclass.  That means that every backslash sequence
+                 * following the first occurrence of a quote increments the
+                 * weight regardless of what the sequence is.  Again, \0 \w \d
+                 * and \s are not controlled by this else, so they change the
+                 * weight by a lot more.  But what makes them so special that
+                 * they aren't subject to this.  Any why does having a quote
+                 * change the behavior from then on.  And why only backslashed
+                 * sequences get this treatment?  This code has been unchanged
+                 * since this function was added in 1993.  I don't get it.
+                 * Instead, it does seem to me that it is especially unlikely
+                 * to repeat a quote in a charclass, but that having just a
+                 * single quote is indicative of a charclass, and having pairs
+                 * of quotes is indicative of a subscript.  Similarly for
+                 * things that could indicate nesting of braces or parens. */
+            }
+            else if (memCHRs("abcfnrtvx", s[1]))
+                weight += 40;   /* \n, etc => charclass */
+                /* khw: Why not \e etc as well? */
+            else if (isDIGIT(s[1])) {
+                weight += 40;   /* \123 => charclass */
+                while (s[1] && isDIGIT(s[1]))
+                    s++;
+            }
+
+            /* khw: There are lots more possible escape sequences.  Some, like
+             * \A,\z have no special meaning to charclasses, so might indicate
+             * a subscript, but I don't know what they would be doing there
+             * either.  Some have been added to the language after this code
+             * was written, but no one thought to, or could wade through this
+             * function, to add them.  Things like \p{} for properties, \N and
+             * \N{}, for example.
+             *
+             * It's problematic that \a is treated as plain 'a' for purposes
+             * of the 'seen' array.  Whatever is matched by these backslashed
+             * sequences should not be added to 'seen'.  That includes the
+             * backslash. */
             break;
 
           case '-':