Skip to content
Open
280 changes: 175 additions & 105 deletions toke.c
Original file line number Diff line number Diff line change
Expand Up @@ -4590,10 +4590,7 @@ S_intuit_more(pTHX_ char *s, char *e,
* written, and regcurly never required a comma, as in {0}. Probably it is
* ok as-is */
if (s[0] == '{') {
if (regcurly(s, e, NULL)) {
return FALSE;
}
return TRUE;
return ! regcurly(s, e, NULL);

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great change. Former pattern appears to be used in other places, marking such cleanup as my next TODO once my current PR is merged.

}

/* Here is '[': maybe we have a character class. Examine the guts */
Expand Down Expand Up @@ -4735,72 +4732,147 @@ S_intuit_more(pTHX_ char *s, char *e,
* looks for.
*
*/
if (isWORDCHAR_lazy_if_safe(s+1, PL_bufend, UTF)) {
Size_t len;

/* khw: where did the magic number 4 come from?. This buffer
* was 4 times as large as tokenbuf in 1997, and had not
* changed since the code was first added */
char tmpbuf[ C_ARRAY_LENGTH(PL_tokenbuf) * 4 ];
/* (Reserve tmpbuf[0] for future commits, hence +1) */
char tmpbuf[ C_ARRAY_LENGTH(PL_tokenbuf) + 1 ];
char * s_after_ident;

/* scan_ident returns NULL if the input looks like an identifier
* that is illegal, e.g., it is too long or is like $001. */
s_after_ident = scan_ident(s, tmpbuf + 1, C_ARRAY_END(tmpbuf),
CHECK_ONLY);
if (s_after_ident == NULL) {

/* An illegal identifier means this can't be a subscript;
* it's an error or it could be a charclass */
return false;
}

Size_t len; /* (C++ forbids joining these 2 lines) */
len = strlen(tmpbuf + 1);

/* If it doesn't look like an identifier at all, scan_ident will
* set tmpbuf[1] to NUL. This is either an error or a character
* class. */
if (len == 0) {
return false;
}

/* If there is extra stuff in the source, like braces, it means
* this is almost definitely intended to be an identifier */
bool decorated;
decorated = (Size_t) (s_after_ident - s) > len;

if (isDIGIT_A(tmpbuf[1])) {

/* &41 and &4b are illegal subroutine names so is an error or
* a charclass */
if (s[0] == '&') {
return false;
}

if (! scan_ident(s, tmpbuf, C_ARRAY_END(tmpbuf), CHECK_ONLY))
/* Here, matches [$@]\d+. If the next input character is a
* \w, we would have something like $456x, which is an illegal
* identifer, so is an error or a charclass */
if ( ! decorated
&& isWORDCHAR_lazy_if_safe(s_after_ident,
PL_bufend, UTF))
{
/* An illegal identifier means this can't be a subscript;
* it's an error or it could be a charclass */
return false;
}

len = strlen(tmpbuf);
/* We don't get here if this potential identifier starts with
* leading zeros, due to the logic in scan_ident. */
assert(len == 1 || tmpbuf[0] != '0');

/* khw: This only looks at global variables; lexicals came
* later, and this hasn't been updated. Ouch!! */
if ( len > 1
&& gv_fetchpvn_flags(tmpbuf,
len,
UTF ? SVf_UTF8 : 0,
SVt_PV))
/* The chances are vanishingly small that someone is going to
* want [$0] to expand to the program's name in a character
* class. But, what would the program's name be doing as part
* of a subscript either? The only likely scenario is that
* this is meant to be a charclass matching either '$' or '0'.
* */
if (tmpbuf[1] == '0') {
return false;
}

/* Here it is either something like $1 which is supposed to
* match either dollar or 1, or it is supposed to expand to
* what is in $1 left over from a capturing group from the
* previous pattern match. In the latter case, it could be
* either a part of wanting to calculate a subscript, or to
* use as the contents of as part of the character class.
* Larger (undecorated) numbers are much less likely to have
* had capturing groups, so they lean more towards a
* charclass. 100 is what this function has traditionally
* used for len>1; khw thinks there is no bias one way or the
* other for length 1 ones. But has chosen 100 for decorated
* identifiers
*
* XXX long enough identifiers could probably return false
* immediately here, rather than using weights. */
if (decorated || len > 1) {
weight -= 100;
}
}
else if (len > 1) {
/* See if there is a known identifier of the given kind. For
* arrays, this might also be a reference to one of its
* elements. XXX Maybe the latter should require a following
* '[' */
if ( is_existing_identifier(tmpbuf, len, s[0], UTF)
|| ( s[0] == '$'
&& is_existing_identifier(tmpbuf, len, '@', UTF)))
{
weight -= 100;

/* khw: Below we keep track of repeated characters; People
* rarely say qr/[aba]/, as the second a is pointless.
* (Some do it though as a mnemonic that is meaningful to
* them.) But generally, repeated characters make things
* more likely to be a charclass. But here, this an
* identifier so likely a subscript. Its spelling should
* be irrelevant to the repeated characters test. So, we
* should advance past it. Suppose it is a hash element,
* like $subscripts{$which}. We should advance past the
* braces and key */
/* khw: Below we keep track of repeated characters;
* People rarely say qr/[aba]/, as the second a is
* pointless. (Some do it though as a mnemonic that is
* meaningful to them.) But generally, repeated characters
* make things more likely to be a charclass. But here,
* this an identifier so likely a subscript. Its spelling
* should be irrelevant to the repeated characters test.
* So, we should advance past it. Suppose it is a hash
* element, like $subscripts{$which}. We should advance
* past the braces and key */
}
else {
/* Not a multi-char identifier already known in the
* program; is somewhat likely to be a subscript.
*
* khw: Our test suite contains several constructs like
* [$A-Z]. Excluding length 1 identifiers in the
* conditional above means such are much less likely to be
* mistaken for subscripts. I would argue that if the next
* character is a '-' followed by an alpha, that would make
* it much more likely to be a charclass. It would only
* make sense to be an expression if that alpha string is a
* bareword with meaning; something like [$A-ord] */
else { /* Isn't a known identifier */
/* Under strict, this means an error. */
if (under_strict_vars) {
return false;
}

/* Otherwise still somewhat likely to be a subscript */
weight -= 10;
}
}
else if ( s[0] == '$'
&& s[1]
&& memCHRs("[#!%*<>()-=", s[1]))
else if ( len == 1
&& s[0] == '$'
&& memCHRs("[#!%*<>()-=", tmpbuf[1]))
{
/* Here we have what could be a punctuation variable. If the
* next character after it is a closing bracket, it makes it
* quite likely to be that, and hence a subscript. If it is
* something else, more mildly a subscript */
if (/*{*/ memCHRs("])} =", s[2]))
if (/*{*/ memCHRs("])} =", tmpbuf[2]))

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

accidentally deleted previous comment :(

little bit confused: this happens in branch when strlen of tmpbuf is already checked to be 1, so tmpbuf[2] should \0

weight -= 10;
else
weight -= 1;
}
else {
/* Not a multi-char identifier already known in the program;
* is somewhat likely to be a subscript.
*
* khw: Our test suite contains several constructs like
* [$A-Z]. Excluding length 1 identifiers in the conditional
* above means such are much less likely to be mistaken for
* subscripts. I would argue that if the next character is a
* '-' followed by an alpha, that would make it much more
* likely to be a charclass. It would only make sense to be
* an expression if that alpha string is a bareword with
* meaning; something like [$A-ord] */
weight -= 10;
}
break;

/* khw: [:blank:] strongly indicates a charclass */
Expand All @@ -4812,66 +4884,64 @@ S_intuit_more(pTHX_ char *s, char *e,
* \? must be subscript for things like \d, but not \a.
*/


case '\\':
if (s[1]) {
if (memCHRs("wds]", s[1])) {
weight += 100; /* \w \d \s => strongly charclass */
/* khw: \] can't happen, as any ']' is beyond our search.
* Why not \W \D \S \h \v, etc as well? Should they have
* the same weights as \w \d \s or should all or some be
* in the 'abcfnrtvx' below? */
} else if (seen[(U8)'\''] || seen[(U8)'"']) {
weight += 1;
/* khw: This is problematic. Enough so, that I misread
* it, and added a wrong comment about what it does in
* 57ae1f3a8e669082e3d5ec6a8cdffbdc39d87bee. Note that it
* doesn't look at the current character. What it
* actually does is: if any quote has been seen in the
* parse, don't do the rest of the else's below, but for
* every subsequent backslashed character encountered
* (except \0 \w \s \d), increment the weight to lean a
* bit more towards being a charclass. That means that
* every backslash sequence following the first occurrence
* of a quote increments the weight regardless of what the
* sequence is. Again, \0 \w \d and \s are not controlled
* by this else, so they change the weight by a lot more.
* But what makes them so special that they aren't subject
* to this. Any why does having a quote change the
* behavior from then on. And why only backslashed
* sequences get this treatment? This code has been
* unchanged since this function was added in 1993. I
* don't get it. Instead, it does seem to me that it is
* especially unlikely to repeat a quote in a charclass,
* but that having just a single quote is indicative of a
* charclass, and having pairs of quotes is indicative of
* a subscript. Similarly for things that could indicate
* nesting of braces or parens. */
}
else if (memCHRs("abcfnrtvx", s[1]))
weight += 40; /* \n, etc => charclass */
/* khw: Why not \e etc as well? */
else if (isDIGIT(s[1])) {
weight += 40; /* \123 => charclass */
while (s[1] && isDIGIT(s[1]))
s++;
}

/* khw: There are lots more possible escape sequences. Some,
* like \A,\z have no special meaning to charclasses, so might
* indicate a subscript, but I don't know what they would be
* doing there either. Some have been added to the language
* after this code was written, but no one thought to, or
* could wade through this function, to add them. Things like
* \p{} for properties, \N and \N{}, for example.
*
* It's problematic that \a is treated as plain 'a' for
* purposes of the 'seen' array. Whatever is matched by these
* backslashed sequences should not be added to 'seen'. That
* includes the backslash. */
}
else /* \ followed by NUL strongly indicates character class */
if (s[1] == '\0') {

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Personally: great change, maybe even add break into then branches and get rid of else if branches

Curious question: common pattern is condition if (s[1]), so why not use if (! s[1] as well

/* \ followed by NUL strongly indicates character class */
weight += 100;
}
else if (memCHRs("wds]", s[1])) {
weight += 100; /* \w \d \s => strongly charclass */
/* khw: \] can't happen, as any ']' is beyond our search. Why
* not \W \D \S \h \v, etc as well? Should they have the same
* weights as \w \d \s or should all or some be in the
* 'abcfnrtvx' below? */
}
else if (seen[(U8)'\''] || seen[(U8)'"']) {
weight += 1;
/* khw: This is problematic. Enough so, that I misread it,
* and added a wrong comment about what it does in
* 57ae1f3a8e669082e3d5ec6a8cdffbdc39d87bee. Note that it
* doesn't look at the current character. What it actually
* does is: if any quote has been seen in the parse, don't do
* the rest of the else's below, but for every subsequent
* backslashed character encountered (except \0 \w \s \d),
* increment the weight to lean a bit more towards being a
* charclass. That means that every backslash sequence
* following the first occurrence of a quote increments the
* weight regardless of what the sequence is. Again, \0 \w \d
* and \s are not controlled by this else, so they change the
* weight by a lot more. But what makes them so special that
* they aren't subject to this. Any why does having a quote
* change the behavior from then on. And why only backslashed
* sequences get this treatment? This code has been unchanged
* since this function was added in 1993. I don't get it.
* Instead, it does seem to me that it is especially unlikely
* to repeat a quote in a charclass, but that having just a
* single quote is indicative of a charclass, and having pairs
* of quotes is indicative of a subscript. Similarly for
* things that could indicate nesting of braces or parens. */
}
else if (memCHRs("abcfnrtvx", s[1]))
weight += 40; /* \n, etc => charclass */
/* khw: Why not \e etc as well? */
else if (isDIGIT(s[1])) {
weight += 40; /* \123 => charclass */
while (s[1] && isDIGIT(s[1]))
s++;
}

/* khw: There are lots more possible escape sequences. Some, like
* \A,\z have no special meaning to charclasses, so might indicate
* a subscript, but I don't know what they would be doing there
* either. Some have been added to the language after this code
* was written, but no one thought to, or could wade through this
* function, to add them. Things like \p{} for properties, \N and
* \N{}, for example.
*
* It's problematic that \a is treated as plain 'a' for purposes
* of the 'seen' array. Whatever is matched by these backslashed
* sequences should not be added to 'seen'. That includes the
* backslash. */
break;

case '-':
Expand Down
Loading